[Openmp-commits] [openmp] [OpenMP][libomptarget] Enable parallel copies via multiple SDMA engines (PR #71801)
Jan Patrick Lehr via Openmp-commits
openmp-commits at lists.llvm.org
Thu Nov 9 14:11:54 PST 2023
https://github.com/jplehr updated https://github.com/llvm/llvm-project/pull/71801
>From 02fada7bd41db4fd5a4c6ce0ab15615969083a03 Mon Sep 17 00:00:00 2001
From: JP Lehr <JanPatrick.Lehr at amd.com>
Date: Thu, 9 Nov 2023 15:17:37 -0500
Subject: [PATCH 1/2] [OpenMP][libomptarget] Enable parallel copies via
multiple SDMA engines
This enables the AMDGPU plugin to use a new ROCm 5.7 interface to
dispatch asynchronous data transfers across SDMA engines.
The default functionality stays unchanged, meaning that all data
transfers are enqueued into an H2D queue or a D2H queue, depending on
transfer direction, via the HSA interface used previously.
The new interface can be enabled via the environment variable
LIBOMPTARGET_AMDGPU_USE_MULTIPLE_SDMA_ENGINES=true when libomptarget is
built against a recent ROCm version (5.7 and later).
As of now, requests are distributed in a round-robin fashion across
available SDMA engines.
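For illustration, the round-robin rotation cycles a one-hot engine mask
through 0x1 -> 0x2 -> 0x4 and back. A minimal standalone sketch of that
scheme (the names here are illustrative, not the plugin's):

  #include <cstdio>

  // Rotate a one-hot mask over three SDMA engines: 0x1, 0x2, 0x4.
  static int NextSdmaEngineMask = 1;

  static int pickNextSdmaEngineMask() {
    int Mask = NextSdmaEngineMask;
    // 0x1 -> 0x2 -> 0x4 -> (0x8 % 7) == 0x1, so the cycle repeats.
    NextSdmaEngineMask = (NextSdmaEngineMask << 1) % 7;
    return Mask;
  }

  int main() {
    for (int I = 0; I < 6; ++I)
      std::printf("copy %d -> engine mask 0x%x\n", I, pickNextSdmaEngineMask());
    return 0;
  }

With the patch applied and libomptarget built against ROCm 5.7 or later, the
feature is opted into at run time, e.g.
LIBOMPTARGET_AMDGPU_USE_MULTIPLE_SDMA_ENGINES=true ./offload-app, where
./offload-app stands in for the actual OpenMP offload binary.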
---
.../plugins-nextgen/amdgpu/src/rtl.cpp | 123 +++++++++++-------
1 file changed, 79 insertions(+), 44 deletions(-)
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
index 399a71390a65abe..15778380f870b6a 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -130,6 +130,38 @@ Error iterateAgentMemoryPools(hsa_agent_t Agent, CallbackTy Cb) {
"Error in hsa_amd_agent_iterate_memory_pools: %s");
}
+/// Dispatches an asynchronous memory copy
+/// Enables different SDMA engines for the dispatch in a round-robin fashion.
+Error asyncMemCopy(bool UseMultipleSdmaEngines, void *Dst, hsa_agent_t DstAgent,
+ const void *Src, hsa_agent_t SrcAgent, size_t Size,
+ uint32_t NumDepSignals, const hsa_signal_t *DepSignals,
+ hsa_signal_t CompletionSignal) {
+  if (!UseMultipleSdmaEngines) {
+ hsa_status_t S =
+ hsa_amd_memory_async_copy(Dst, DstAgent, Src, SrcAgent, Size,
+ NumDepSignals, DepSignals, CompletionSignal);
+ return Plugin::check(S, "Error in hsa_amd_memory_async_copy");
+ }
+
+// This solution is probably not the best
+#if !(HSA_AMD_INTERFACE_VERSION_MAJOR >= 1 && \
+ HSA_AMD_INTERFACE_VERSION_MINOR >= 2)
+ return Plugin::error("Async copy on selected SDMA requires ROCm 5.7");
+#else
+ static int SdmaEngine = 1;
+
+  // This call is only available in ROCm >= 5.7
+ hsa_status_t S = hsa_amd_memory_async_copy_on_engine(
+ Dst, DstAgent, Src, SrcAgent, Size, NumDepSignals, DepSignals,
+ CompletionSignal, (hsa_amd_sdma_engine_id_t)SdmaEngine,
+ /*force_copy_on_sdma=*/true);
+  // Rotate to the next of the three SDMA engines: 0x1, 0x2, 0x4
+ SdmaEngine = (SdmaEngine << 1) % 7;
+
+ return Plugin::check(S, "Error in hsa_amd_memory_async_copy_on_engine");
+#endif
+}
+
} // namespace utils
/// Utility class representing generic resource references to AMDGPU resources.
@@ -945,6 +977,9 @@ struct AMDGPUStreamTy {
/// Timeout hint for HSA actively waiting for signal value to change
const uint64_t StreamBusyWaitMicroseconds;
+  /// Indicate whether to spread data transfers across all available SDMAs
+ bool UseMultipleSdmaEngines;
+
/// Return the current number of asynchronous operations on the stream.
uint32_t size() const { return NextSlot; }
@@ -1170,15 +1205,15 @@ struct AMDGPUStreamTy {
InputSignal = nullptr;
// Issue the async memory copy.
- hsa_status_t Status;
if (InputSignal) {
hsa_signal_t InputSignalRaw = InputSignal->get();
- Status = hsa_amd_memory_async_copy(Dst, Agent, Src, Agent, CopySize, 1,
- &InputSignalRaw, OutputSignal->get());
- } else
- Status = hsa_amd_memory_async_copy(Dst, Agent, Src, Agent, CopySize, 0,
- nullptr, OutputSignal->get());
- return Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s");
+ return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src, Agent,
+ CopySize, 1, &InputSignalRaw,
+ OutputSignal->get());
+ }
+
+ return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src, Agent,
+ CopySize, 0, nullptr, OutputSignal->get());
}
/// Push an asynchronous memory copy device-to-host involving an unpinned
@@ -1214,21 +1249,19 @@ struct AMDGPUStreamTy {
// Issue the first step: device to host transfer. Avoid defining the input
// dependency if already satisfied.
- hsa_status_t Status;
if (InputSignal) {
hsa_signal_t InputSignalRaw = InputSignal->get();
- Status =
- hsa_amd_memory_async_copy(Inter, Agent, Src, Agent, CopySize, 1,
- &InputSignalRaw, OutputSignals[0]->get());
+ if (auto Err = utils::asyncMemCopy(
+ UseMultipleSdmaEngines, Inter, Agent, Src, Agent, CopySize, 1,
+ &InputSignalRaw, OutputSignals[0]->get()))
+ return Err;
} else {
- Status = hsa_amd_memory_async_copy(Inter, Agent, Src, Agent, CopySize, 0,
- nullptr, OutputSignals[0]->get());
+ if (auto Err = utils::asyncMemCopy(UseMultipleSdmaEngines, Inter, Agent,
+ Src, Agent, CopySize, 0, nullptr,
+ OutputSignals[0]->get()))
+ return Err;
}
- if (auto Err =
- Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s"))
- return Err;
-
// Consume another stream slot and compute dependencies.
std::tie(Curr, InputSignal) = consume(OutputSignals[1]);
assert(InputSignal && "Invalid input signal");
@@ -1242,7 +1275,7 @@ struct AMDGPUStreamTy {
std::atomic_thread_fence(std::memory_order_release);
// Issue the second step: host to host transfer.
- Status = hsa_amd_signal_async_handler(
+ hsa_status_t Status = hsa_amd_signal_async_handler(
InputSignal->get(), HSA_SIGNAL_CONDITION_EQ, 0, asyncActionCallback,
(void *)&Slots[Curr]);
@@ -1318,16 +1351,14 @@ struct AMDGPUStreamTy {
// Issue the second step: host to device transfer. Avoid defining the input
// dependency if already satisfied.
- hsa_status_t Status;
if (InputSignal && InputSignal->load()) {
hsa_signal_t InputSignalRaw = InputSignal->get();
- Status = hsa_amd_memory_async_copy(Dst, Agent, Inter, Agent, CopySize, 1,
- &InputSignalRaw, OutputSignal->get());
- } else
- Status = hsa_amd_memory_async_copy(Dst, Agent, Inter, Agent, CopySize, 0,
- nullptr, OutputSignal->get());
-
- return Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s");
+ return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter,
+ Agent, CopySize, 1, &InputSignalRaw,
+ OutputSignal->get());
+ }
+ return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter, Agent,
+ CopySize, 0, nullptr, OutputSignal->get());
}
// AMDGPUDeviceTy is incomplete here, passing the underlying agent instead
@@ -1353,17 +1384,15 @@ struct AMDGPUStreamTy {
// allocated by this runtime or the caller made the appropriate
// access calls.
- hsa_status_t Status;
if (InputSignal && InputSignal->load()) {
hsa_signal_t InputSignalRaw = InputSignal->get();
- Status =
- hsa_amd_memory_async_copy(Dst, DstAgent, Src, SrcAgent, CopySize, 1,
- &InputSignalRaw, OutputSignal->get());
- } else
- Status = hsa_amd_memory_async_copy(Dst, DstAgent, Src, SrcAgent, CopySize,
- 0, nullptr, OutputSignal->get());
-
- return Plugin::check(Status, "Error in D2D hsa_amd_memory_async_copy: %s");
+ return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src,
+ SrcAgent, CopySize, 1, &InputSignalRaw,
+ OutputSignal->get());
+ }
+ return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src,
+ SrcAgent, CopySize, 0, nullptr,
+ OutputSignal->get());
}
/// Synchronize with the stream. The current thread waits until all operations
@@ -1788,6 +1817,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
OMPX_InitialNumSignals("LIBOMPTARGET_AMDGPU_NUM_INITIAL_HSA_SIGNALS",
64),
OMPX_StreamBusyWait("LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT", 2000000),
+ OMPX_UseMultipleSdmaEngines(
+ "LIBOMPTARGET_AMDGPU_USE_MULTIPLE_SDMA_ENGINES", false),
AMDGPUStreamManager(*this, Agent), AMDGPUEventManager(*this),
AMDGPUSignalManager(*this), Agent(Agent), HostDevice(HostDevice) {}
@@ -2196,10 +2227,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
if (auto Err = Signal.init())
return Err;
- Status = hsa_amd_memory_async_copy(TgtPtr, Agent, PinnedPtr, Agent, Size,
- 0, nullptr, Signal.get());
- if (auto Err =
- Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s"))
+ if (auto Err = utils::asyncMemCopy(getUseMultipleSdmaEngines(), TgtPtr,
+ Agent, PinnedPtr, Agent, Size, 0,
+ nullptr, Signal.get()))
return Err;
if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds()))
@@ -2257,10 +2287,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
if (auto Err = Signal.init())
return Err;
- Status = hsa_amd_memory_async_copy(PinnedPtr, Agent, TgtPtr, Agent, Size,
- 0, nullptr, Signal.get());
- if (auto Err =
- Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s"))
+ if (auto Err = utils::asyncMemCopy(getUseMultipleSdmaEngines(), PinnedPtr,
+ Agent, TgtPtr, Agent, Size, 0, nullptr,
+ Signal.get()))
return Err;
if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds()))
@@ -2623,6 +2652,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
});
}
+ bool getUseMultipleSdmaEngines() { return OMPX_UseMultipleSdmaEngines; }
+
private:
using AMDGPUEventRef = AMDGPUResourceRef<AMDGPUEventTy>;
using AMDGPUEventManagerTy = GenericDeviceResourceManagerTy<AMDGPUEventRef>;
@@ -2660,6 +2691,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
/// are microseconds.
UInt32Envar OMPX_StreamBusyWait;
+  /// Use the ROCm 5.7 interface for multiple SDMA engines
+ BoolEnvar OMPX_UseMultipleSdmaEngines;
+
/// Stream manager for AMDGPU streams.
AMDGPUStreamManagerTy AMDGPUStreamManager;
@@ -2761,7 +2795,8 @@ AMDGPUStreamTy::AMDGPUStreamTy(AMDGPUDeviceTy &Device)
SignalManager(Device.getSignalManager()), Device(Device),
// Initialize the std::deque with some empty positions.
Slots(32), NextSlot(0), SyncCycle(0), RPCServer(nullptr),
- StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()) {}
+ StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()),
+ UseMultipleSdmaEngines(Device.getUseMultipleSdmaEngines()) {}
/// Class implementing the AMDGPU-specific functionalities of the global
/// handler.
>From 9f6224d014502c00fb9cd4e7b929ca159f751206 Mon Sep 17 00:00:00 2001
From: JP Lehr <JanPatrick.Lehr at amd.com>
Date: Thu, 9 Nov 2023 17:11:08 -0500
Subject: [PATCH 2/2] fixup! [OpenMP][libomptarget] Enable parallel copies via
multiple SDMA engines
---
.../plugins-nextgen/amdgpu/src/rtl.cpp | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
index 15778380f870b6a..a4df76d56f2bde5 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -130,7 +130,7 @@ Error iterateAgentMemoryPools(hsa_agent_t Agent, CallbackTy Cb) {
"Error in hsa_amd_agent_iterate_memory_pools: %s");
}
-/// Dispatches an asynchronous memory copy
+/// Dispatches an asynchronous memory copy.
/// Enables different SDMA engines for the dispatch in a round-robin fashion.
Error asyncMemCopy(bool UseMultipleSdmaEngines, void *Dst, hsa_agent_t DstAgent,
const void *Src, hsa_agent_t SrcAgent, size_t Size,
@@ -140,7 +140,7 @@ Error asyncMemCopy(bool UseMultipleSdmaEngines, void *Dst, hsa_agent_t DstAgent,
hsa_status_t S =
hsa_amd_memory_async_copy(Dst, DstAgent, Src, SrcAgent, Size,
NumDepSignals, DepSignals, CompletionSignal);
- return Plugin::check(S, "Error in hsa_amd_memory_async_copy");
+ return Plugin::check(S, "Error in hsa_amd_memory_async_copy: %s");
}
// This solution is probably not the best
@@ -158,7 +158,7 @@ Error asyncMemCopy(bool UseMultipleSdmaEngines, void *Dst, hsa_agent_t DstAgent,
// Rotate to the next of the three SDMA engines: 0x1, 0x2, 0x4
SdmaEngine = (SdmaEngine << 1) % 7;
- return Plugin::check(S, "Error in hsa_amd_memory_async_copy_on_engine");
+ return Plugin::check(S, "Error in hsa_amd_memory_async_copy_on_engine: %s");
#endif
}
@@ -2227,7 +2227,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
if (auto Err = Signal.init())
return Err;
- if (auto Err = utils::asyncMemCopy(getUseMultipleSdmaEngines(), TgtPtr,
+ if (auto Err = utils::asyncMemCopy(useMultipleSdmaEngines(), TgtPtr,
Agent, PinnedPtr, Agent, Size, 0,
nullptr, Signal.get()))
return Err;
@@ -2287,7 +2287,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
if (auto Err = Signal.init())
return Err;
- if (auto Err = utils::asyncMemCopy(getUseMultipleSdmaEngines(), PinnedPtr,
+ if (auto Err = utils::asyncMemCopy(useMultipleSdmaEngines(), PinnedPtr,
Agent, TgtPtr, Agent, Size, 0, nullptr,
Signal.get()))
return Err;
@@ -2652,7 +2652,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
});
}
- bool getUseMultipleSdmaEngines() { return OMPX_UseMultipleSdmaEngines; }
+ bool useMultipleSdmaEngines() const { return OMPX_UseMultipleSdmaEngines; }
private:
using AMDGPUEventRef = AMDGPUResourceRef<AMDGPUEventTy>;
@@ -2796,7 +2796,7 @@ AMDGPUStreamTy::AMDGPUStreamTy(AMDGPUDeviceTy &Device)
// Initialize the std::deque with some empty positions.
Slots(32), NextSlot(0), SyncCycle(0), RPCServer(nullptr),
StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()),
- UseMultipleSdmaEngines(Device.getUseMultipleSdmaEngines()) {}
+ UseMultipleSdmaEngines(Device.useMultipleSdmaEngines()) {}
/// Class implementing the AMDGPU-specific functionalities of the global
/// handler.
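As a closing note for readers on older ROCm stacks: the availability check in
the patch keys off the HSA interface version macros from ROCm's hsa_ext_amd.h.
A minimal standalone probe of the same condition (the include path assumes the
usual ROCm header layout; adjust if yours differs):

  #include <cstdio>
  #include <hsa/hsa_ext_amd.h>

  int main() {
  #if HSA_AMD_INTERFACE_VERSION_MAJOR >= 1 && HSA_AMD_INTERFACE_VERSION_MINOR >= 2
    // ROCm >= 5.7: hsa_amd_memory_async_copy_on_engine is available.
    std::printf("multiple SDMA engines supported\n");
  #else
    // Older ROCm: only the plain hsa_amd_memory_async_copy path exists.
    std::printf("falling back to single-queue copies\n");
  #endif
    return 0;
  }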