[Openmp-commits] [openmp] [OpenMP][libomptarget] Enable parallel copies via multiple SDMA engines (PR #71801)
Jan Patrick Lehr via Openmp-commits
openmp-commits at lists.llvm.org
Thu Nov 9 04:19:00 PST 2023
https://github.com/jplehr created https://github.com/llvm/llvm-project/pull/71801
This enables the AMDGPU plugin to use a new ROCm 5.7 interface to dispatch asynchronous data transfers across SDMA engines.
The default functionality stays unchanged, meaning that all data transfers are enqueued into a H2D queue or an D2H queue, depending on transfer direction, via the HSA interface used previously.
The new interface can be enabled via the environment variable `LIBOMPTARGET_AMDGPU_USE_MULTIPLE_SDMA_ENGINES=true` when libomptarget is built against a recent ROCm version (5.7 and later).
As of now, requests are distributed in a round-robin fashion across available SDMA engines.
>From 2d550f3db7de666c6a9ddde2c2d644bcbc848e41 Mon Sep 17 00:00:00 2001
From: JP Lehr <JanPatrick.Lehr at amd.com>
Date: Mon, 6 Nov 2023 04:35:04 -0500
Subject: [PATCH] [OpenMP][libomptarget] Enable parallel copies via multiple
SDMA engines
This enables the AMDGPU plugin to use a new ROCm 5.7 interface to dispatch
asynchronous data transfers to multiple SDMA engines.
The default functionality stays unchanged, meaning that all data
transfers are enqueued into a H2D queue or an D2H queue, depending on
transfer direction, via the HSA interface used previously.
The new interface can be enabled via the environment variable
LIBOMPTARGET_AMDGPU_USE_MULTIPLE_SDMA_ENGINES=true when libomptarget is built
against a recent ROCm version (5.7 and later).
As of now, requests are distributed in a round-robin fashion across
available SDMA engines.
---
.../amdgpu/dynamic_hsa/hsa_ext_amd.h | 10 +-
.../plugins-nextgen/amdgpu/src/rtl.cpp | 105 +++++++++++-------
2 files changed, 66 insertions(+), 49 deletions(-)
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h b/openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
index 188dd2600a610c6..2cebd0d35df088f 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
@@ -151,12 +151,10 @@ typedef struct hsa_amd_pointer_info_s {
size_t sizeInBytes;
} hsa_amd_pointer_info_t;
-hsa_status_t hsa_amd_pointer_info(const void* ptr,
- hsa_amd_pointer_info_t* info,
- void* (*alloc)(size_t),
- uint32_t* num_agents_accessible,
- hsa_agent_t** accessible);
-
+hsa_status_t hsa_amd_pointer_info(const void *ptr, hsa_amd_pointer_info_t *info,
+ void *(*alloc)(size_t),
+ uint32_t *num_agents_accessible,
+ hsa_agent_t **accessible);
#ifdef __cplusplus
}
#endif
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
index 399a71390a65abe..9c2db4cdaf3bcd8 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -130,6 +130,40 @@ Error iterateAgentMemoryPools(hsa_agent_t Agent, CallbackTy Cb) {
"Error in hsa_amd_agent_iterate_memory_pools: %s");
}
+/// Dispatches an asynchronous memory copy
+/// Enables different SDMA engines for the dispatch in a round-robin fashion.
+Error asyncMemCopy(void *Dst, hsa_agent_t DstAgent, const void *Src,
+ hsa_agent_t SrcAgent, size_t Size, uint32_t NumDepSignals,
+ const hsa_signal_t *DepSignals,
+ hsa_signal_t CompletionSignal) {
+ static BoolEnvar OMPX_UseMultipleSdmaEngines{
+ "LIBOMPTARGET_AMDGPU_USE_MULTIPLE_SDMA_ENGINES", false};
+ if (!OMPX_UseMultipleSdmaEngines) {
+ hsa_status_t S =
+ hsa_amd_memory_async_copy(Dst, DstAgent, Src, SrcAgent, Size,
+ NumDepSignals, DepSignals, CompletionSignal);
+ return Plugin::check(S, "Error in hsa_amd_memory_async_copy");
+ }
+
+// This solution is probably not the best
+#if !(HSA_AMD_INTERFACE_VERSION_MAJOR >= 1 && \
+ HSA_AMD_INTERFACE_VERSION_MINOR >= 2)
+ return Plugin::error("Async copy on selected SDMA requires ROCm 5.7");
+#else
+ static int SdmaEngine = 1;
+
+ // This call is only avail in ROCm >= 5.7
+ hsa_status_t S = hsa_amd_memory_async_copy_on_engine(
+ Dst, DstAgent, Src, SrcAgent, Size, NumDepSignals, DepSignals,
+ CompletionSignal, (hsa_amd_sdma_engine_id_t)SdmaEngine,
+ /*force_copy_on_sdma=*/true);
+ // Increment to use one of three SDMA engines: 0x1, 0x2, 0x4
+ SdmaEngine = (SdmaEngine << 1) % 7;
+
+ return Plugin::check(S, "Error in hsa_amd_memory_async_copy_on_engine");
+#endif
+}
+
} // namespace utils
/// Utility class representing generic resource references to AMDGPU resources.
@@ -1170,15 +1204,14 @@ struct AMDGPUStreamTy {
InputSignal = nullptr;
// Issue the async memory copy.
- hsa_status_t Status;
if (InputSignal) {
hsa_signal_t InputSignalRaw = InputSignal->get();
- Status = hsa_amd_memory_async_copy(Dst, Agent, Src, Agent, CopySize, 1,
- &InputSignalRaw, OutputSignal->get());
- } else
- Status = hsa_amd_memory_async_copy(Dst, Agent, Src, Agent, CopySize, 0,
- nullptr, OutputSignal->get());
- return Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s");
+ return utils::asyncMemCopy(Dst, Agent, Src, Agent, CopySize, 1,
+ &InputSignalRaw, OutputSignal->get());
+ }
+
+ return utils::asyncMemCopy(Dst, Agent, Src, Agent, CopySize, 0, nullptr,
+ OutputSignal->get());
}
/// Push an asynchronous memory copy device-to-host involving an unpinned
@@ -1214,21 +1247,18 @@ struct AMDGPUStreamTy {
// Issue the first step: device to host transfer. Avoid defining the input
// dependency if already satisfied.
- hsa_status_t Status;
if (InputSignal) {
hsa_signal_t InputSignalRaw = InputSignal->get();
- Status =
- hsa_amd_memory_async_copy(Inter, Agent, Src, Agent, CopySize, 1,
- &InputSignalRaw, OutputSignals[0]->get());
+ if (auto Err =
+ utils::asyncMemCopy(Inter, Agent, Src, Agent, CopySize, 1,
+ &InputSignalRaw, OutputSignals[0]->get()))
+ return Err;
} else {
- Status = hsa_amd_memory_async_copy(Inter, Agent, Src, Agent, CopySize, 0,
- nullptr, OutputSignals[0]->get());
+ if (auto Err = utils::asyncMemCopy(Inter, Agent, Src, Agent, CopySize, 0,
+ nullptr, OutputSignals[0]->get()))
+ return Err;
}
- if (auto Err =
- Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s"))
- return Err;
-
// Consume another stream slot and compute dependencies.
std::tie(Curr, InputSignal) = consume(OutputSignals[1]);
assert(InputSignal && "Invalid input signal");
@@ -1242,7 +1272,7 @@ struct AMDGPUStreamTy {
std::atomic_thread_fence(std::memory_order_release);
// Issue the second step: host to host transfer.
- Status = hsa_amd_signal_async_handler(
+ hsa_status_t Status = hsa_amd_signal_async_handler(
InputSignal->get(), HSA_SIGNAL_CONDITION_EQ, 0, asyncActionCallback,
(void *)&Slots[Curr]);
@@ -1318,16 +1348,13 @@ struct AMDGPUStreamTy {
// Issue the second step: host to device transfer. Avoid defining the input
// dependency if already satisfied.
- hsa_status_t Status;
if (InputSignal && InputSignal->load()) {
hsa_signal_t InputSignalRaw = InputSignal->get();
- Status = hsa_amd_memory_async_copy(Dst, Agent, Inter, Agent, CopySize, 1,
- &InputSignalRaw, OutputSignal->get());
- } else
- Status = hsa_amd_memory_async_copy(Dst, Agent, Inter, Agent, CopySize, 0,
- nullptr, OutputSignal->get());
-
- return Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s");
+ return utils::asyncMemCopy(Dst, Agent, Inter, Agent, CopySize, 1,
+ &InputSignalRaw, OutputSignal->get());
+ }
+ return utils::asyncMemCopy(Dst, Agent, Inter, Agent, CopySize, 0, nullptr,
+ OutputSignal->get());
}
// AMDGPUDeviceTy is incomplete here, passing the underlying agent instead
@@ -1353,17 +1380,13 @@ struct AMDGPUStreamTy {
// allocated by this runtime or the caller made the appropriate
// access calls.
- hsa_status_t Status;
if (InputSignal && InputSignal->load()) {
hsa_signal_t InputSignalRaw = InputSignal->get();
- Status =
- hsa_amd_memory_async_copy(Dst, DstAgent, Src, SrcAgent, CopySize, 1,
- &InputSignalRaw, OutputSignal->get());
- } else
- Status = hsa_amd_memory_async_copy(Dst, DstAgent, Src, SrcAgent, CopySize,
- 0, nullptr, OutputSignal->get());
-
- return Plugin::check(Status, "Error in D2D hsa_amd_memory_async_copy: %s");
+ return utils::asyncMemCopy(Dst, DstAgent, Src, SrcAgent, CopySize, 1,
+ &InputSignalRaw, OutputSignal->get());
+ }
+ return utils::asyncMemCopy(Dst, DstAgent, Src, SrcAgent, CopySize, 0,
+ nullptr, OutputSignal->get());
}
/// Synchronize with the stream. The current thread waits until all operations
@@ -2196,10 +2219,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
if (auto Err = Signal.init())
return Err;
- Status = hsa_amd_memory_async_copy(TgtPtr, Agent, PinnedPtr, Agent, Size,
- 0, nullptr, Signal.get());
- if (auto Err =
- Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s"))
+ if (auto Err = utils::asyncMemCopy(TgtPtr, Agent, PinnedPtr, Agent, Size,
+ 0, nullptr, Signal.get()))
return Err;
if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds()))
@@ -2257,10 +2278,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
if (auto Err = Signal.init())
return Err;
- Status = hsa_amd_memory_async_copy(PinnedPtr, Agent, TgtPtr, Agent, Size,
- 0, nullptr, Signal.get());
- if (auto Err =
- Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s"))
+ if (auto Err = utils::asyncMemCopy(PinnedPtr, Agent, TgtPtr, Agent, Size,
+ 0, nullptr, Signal.get()))
return Err;
if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds()))
More information about the Openmp-commits
mailing list