[Openmp-commits] [openmp] [OpenMP] [amdgpu] Added a synchronous version of data exchange. (PR #87032)
via Openmp-commits
openmp-commits at lists.llvm.org
Thu Mar 28 20:19:44 PDT 2024
https://github.com/dhruvachak updated https://github.com/llvm/llvm-project/pull/87032
>From 962bb4cf0d572b12cc653104623af7983326076e Mon Sep 17 00:00:00 2001
From: Dhruva Chakrabarti <Dhruva.Chakrabarti at amd.com>
Date: Thu, 28 Mar 2024 21:23:38 -0400
Subject: [PATCH] [OpenMP] [amdgpu] Added a synchronous version of data
exchange.
Similar to H2D and D2H, use synchronous mode for large data transfers
beyond a certain size for D2D as well. As with H2D and D2H, this size is
controlled by an env-var.
---
.../plugins-nextgen/amdgpu/src/rtl.cpp | 21 ++++++
.../test/offloading/d2d_memcpy_sync.c | 67 +++++++++++++++++++
2 files changed, 88 insertions(+)
create mode 100644 openmp/libomptarget/test/offloading/d2d_memcpy_sync.c
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
index 2dd08dd5d0b4ea..a0fdde951b74a7 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -2402,6 +2402,27 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
AsyncInfoWrapperTy &AsyncInfoWrapper) override {
AMDGPUDeviceTy &DstDevice = static_cast<AMDGPUDeviceTy &>(DstGenericDevice);
+ // For large transfers use synchronous behavior.
+ if (Size >= OMPX_MaxAsyncCopyBytes) {
+ if (AsyncInfoWrapper.hasQueue())
+ if (auto Err = synchronize(AsyncInfoWrapper))
+ return Err;
+
+ AMDGPUSignalTy Signal;
+ if (auto Err = Signal.init())
+ return Err;
+
+ if (auto Err = utils::asyncMemCopy(
+ useMultipleSdmaEngines(), DstPtr, DstDevice.getAgent(), SrcPtr,
+ getAgent(), (uint64_t)Size, 0, nullptr, Signal.get()))
+ return Err;
+
+ if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds()))
+ return Err;
+
+ return Signal.deinit();
+ }
+
AMDGPUStreamTy *Stream = nullptr;
if (auto Err = getStream(AsyncInfoWrapper, Stream))
return Err;
diff --git a/openmp/libomptarget/test/offloading/d2d_memcpy_sync.c b/openmp/libomptarget/test/offloading/d2d_memcpy_sync.c
new file mode 100644
index 00000000000000..a768cd1209ac52
--- /dev/null
+++ b/openmp/libomptarget/test/offloading/d2d_memcpy_sync.c
@@ -0,0 +1,67 @@
+// RUN: %libomptarget-compile-generic && \
+// RUN: env LIBOMPTARGET_AMDGPU_MAX_ASYNC_COPY_BYTES=0 %libomptarget-run-generic | \
+// RUN: %fcheck-generic -allow-empty
+// REQUIRES: amdgcn-amd-amdhsa
+
+#include <assert.h>
+#include <omp.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+const int magic_num = 7;
+
+int main(int argc, char *argv[]) {
+ const int N = 128;
+ const int num_devices = omp_get_num_devices();
+
+ // No target device, just return
+ if (num_devices == 0) {
+ printf("PASS\n");
+ return 0;
+ }
+
+ const int src_device = 0;
+ int dst_device = num_devices - 1;
+
+ int length = N * sizeof(int);
+ int *src_ptr = omp_target_alloc(length, src_device);
+ int *dst_ptr = omp_target_alloc(length, dst_device);
+
+ assert(src_ptr && "src_ptr is NULL");
+ assert(dst_ptr && "dst_ptr is NULL");
+
+#pragma omp target teams distribute parallel for device(src_device) \
+ is_device_ptr(src_ptr)
+ for (int i = 0; i < N; ++i) {
+ src_ptr[i] = magic_num;
+ }
+
+ int rc =
+ omp_target_memcpy(dst_ptr, src_ptr, length, 0, 0, dst_device, src_device);
+
+ assert(rc == 0 && "error in omp_target_memcpy");
+
+ int *buffer = malloc(length);
+
+ assert(buffer && "failed to allocate host buffer");
+
+#pragma omp target teams distribute parallel for device(dst_device) \
+ map(from : buffer[0 : N]) is_device_ptr(dst_ptr)
+ for (int i = 0; i < N; ++i) {
+ buffer[i] = dst_ptr[i] + magic_num;
+ }
+
+ for (int i = 0; i < N; ++i)
+ assert(buffer[i] == 2 * magic_num);
+
+ printf("PASS\n");
+
+ // Free host and device memory
+ free(buffer);
+ omp_target_free(src_ptr, src_device);
+ omp_target_free(dst_ptr, dst_device);
+
+ return 0;
+}
+
+// CHECK: PASS
More information about the Openmp-commits
mailing list