[Openmp-commits] [openmp] [OpenMP][libomptarget] Enable parallel copies via multiple SDMA engines (PR #71801)

Thu Nov 9 04:19:00 PST 2023

https://github.com/jplehr created https://github.com/llvm/llvm-project/pull/71801

This enables the AMDGPU plugin to use a new ROCm 5.7 interface to dispatch asynchronous data transfers across SDMA engines.

The default functionality stays unchanged, meaning that all data transfers are enqueued into an H2D queue or a D2H queue, depending on transfer direction, via the HSA interface used previously.

The new interface can be enabled via the environment variable `LIBOMPTARGET_AMDGPU_USE_MULTIPLE_SDMA_ENGINES=true` when libomptarget is built against a recent ROCm version (5.7 and later).
As of now, requests are distributed in a round-robin fashion across available SDMA engines.

From 2d550f3db7de666c6a9ddde2c2d644bcbc848e41 Mon Sep 17 00:00:00 2001
From: JP Lehr <JanPatrick.Lehr at amd.com>
Date: Mon, 6 Nov 2023 04:35:04 -0500
Subject: [PATCH] [OpenMP][libomptarget] Enable parallel copies via multiple
 SDMA engines

This enables the AMDGPU plugin to use a new ROCm 5.7 interface to dispatch
asynchronous data transfers to multiple SDMA engines.

The default functionality stays unchanged, meaning that all data
transfers are enqueued into an H2D queue or a D2H queue, depending on
transfer direction, via the HSA interface used previously.

The new interface can be enabled via the environment variable
LIBOMPTARGET_AMDGPU_USE_MULTIPLE_SDMA_ENGINES=true when libomptarget is built
against a recent ROCm version (5.7 and later).
As of now, requests are distributed in a round-robin fashion across
available SDMA engines.
---
 .../amdgpu/dynamic_hsa/hsa_ext_amd.h          |  10 +-
 .../plugins-nextgen/amdgpu/src/rtl.cpp        | 105 +++++++++++-------
 2 files changed, 66 insertions(+), 49 deletions(-)

diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h b/openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
index 188dd2600a610c6..2cebd0d35df088f 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
@@ -151,12 +151,10 @@ typedef struct hsa_amd_pointer_info_s {
   size_t sizeInBytes;
 } hsa_amd_pointer_info_t;
 
-hsa_status_t hsa_amd_pointer_info(const void* ptr,
-                                          hsa_amd_pointer_info_t* info,
-                                          void* (*alloc)(size_t),
-                                          uint32_t* num_agents_accessible,
-                                          hsa_agent_t** accessible);
-
+hsa_status_t hsa_amd_pointer_info(const void *ptr, hsa_amd_pointer_info_t *info,
+                                  void *(*alloc)(size_t),
+                                  uint32_t *num_agents_accessible,
+                                  hsa_agent_t **accessible);
 #ifdef __cplusplus
 }
 #endif
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
index 399a71390a65abe..9c2db4cdaf3bcd8 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -130,6 +130,40 @@ Error iterateAgentMemoryPools(hsa_agent_t Agent, CallbackTy Cb) {
                        "Error in hsa_amd_agent_iterate_memory_pools: %s");
 }
 
+/// Dispatch an asynchronous memory copy of \p Size bytes from \p Src to \p Dst.
+/// Uses hsa_amd_memory_async_copy unless the boolean envar
+/// LIBOMPTARGET_AMDGPU_USE_MULTIPLE_SDMA_ENGINES is set; then copies are spread
+/// round-robin over the SDMA engines 0x1, 0x2 and 0x4 (needs ROCm >= 5.7).
+Error asyncMemCopy(void *Dst, hsa_agent_t DstAgent, const void *Src,
+                   hsa_agent_t SrcAgent, size_t Size, uint32_t NumDepSignals,
+                   const hsa_signal_t *DepSignals,
+                   hsa_signal_t CompletionSignal) {
+  static BoolEnvar OMPX_UseMultipleSdmaEngines{
+      "LIBOMPTARGET_AMDGPU_USE_MULTIPLE_SDMA_ENGINES", false};
+  if (!OMPX_UseMultipleSdmaEngines) {
+    hsa_status_t S =
+        hsa_amd_memory_async_copy(Dst, DstAgent, Src, SrcAgent, Size,
+                                  NumDepSignals, DepSignals, CompletionSignal);
+    return Plugin::check(S, "Error in hsa_amd_memory_async_copy: %s");
+  }
+
+// The engine-selecting entry point needs HSA AMD interface version >= 1.2.
+#if !(HSA_AMD_INTERFACE_VERSION_MAJOR > 1 ||                                   \
+      (HSA_AMD_INTERFACE_VERSION_MAJOR == 1 &&                                 \
+       HSA_AMD_INTERFACE_VERSION_MINOR >= 2))
+  return Plugin::error("Async copy on selected SDMA requires ROCm 5.7");
+#else
+  // Atomic ticket so that concurrent callers cannot race on engine choice.
+  static std::atomic<uint32_t> NextCopy{0};
+  const uint32_t Engine = 1u << (NextCopy.fetch_add(1) % 3); // 0x1, 0x2, 0x4
+  hsa_status_t S = hsa_amd_memory_async_copy_on_engine(
+      Dst, DstAgent, Src, SrcAgent, Size, NumDepSignals, DepSignals,
+      CompletionSignal, static_cast<hsa_amd_sdma_engine_id_t>(Engine),
+      /*force_copy_on_sdma=*/true);
+  return Plugin::check(S, "Error in hsa_amd_memory_async_copy_on_engine: %s");
+#endif
+}
+
 } // namespace utils
 
 /// Utility class representing generic resource references to AMDGPU resources.
@@ -1170,15 +1204,14 @@ struct AMDGPUStreamTy {
       InputSignal = nullptr;
 
     // Issue the async memory copy.
-    hsa_status_t Status;
     if (InputSignal) {
       hsa_signal_t InputSignalRaw = InputSignal->get();
-      Status = hsa_amd_memory_async_copy(Dst, Agent, Src, Agent, CopySize, 1,
-                                         &InputSignalRaw, OutputSignal->get());
-    } else
-      Status = hsa_amd_memory_async_copy(Dst, Agent, Src, Agent, CopySize, 0,
-                                         nullptr, OutputSignal->get());
-    return Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s");
+      return utils::asyncMemCopy(Dst, Agent, Src, Agent, CopySize, 1,
+                                 &InputSignalRaw, OutputSignal->get());
+    }
+
+    return utils::asyncMemCopy(Dst, Agent, Src, Agent, CopySize, 0, nullptr,
+                               OutputSignal->get());
   }
 
   /// Push an asynchronous memory copy device-to-host involving an unpinned
@@ -1214,21 +1247,18 @@ struct AMDGPUStreamTy {
 
     // Issue the first step: device to host transfer. Avoid defining the input
     // dependency if already satisfied.
-    hsa_status_t Status;
     if (InputSignal) {
       hsa_signal_t InputSignalRaw = InputSignal->get();
-      Status =
-          hsa_amd_memory_async_copy(Inter, Agent, Src, Agent, CopySize, 1,
-                                    &InputSignalRaw, OutputSignals[0]->get());
+      if (auto Err =
+              utils::asyncMemCopy(Inter, Agent, Src, Agent, CopySize, 1,
+                                  &InputSignalRaw, OutputSignals[0]->get()))
+        return Err;
     } else {
-      Status = hsa_amd_memory_async_copy(Inter, Agent, Src, Agent, CopySize, 0,
-                                         nullptr, OutputSignals[0]->get());
+      if (auto Err = utils::asyncMemCopy(Inter, Agent, Src, Agent, CopySize, 0,
+                                         nullptr, OutputSignals[0]->get()))
+        return Err;
     }
 
-    if (auto Err =
-            Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s"))
-      return Err;
-
     // Consume another stream slot and compute dependencies.
     std::tie(Curr, InputSignal) = consume(OutputSignals[1]);
     assert(InputSignal && "Invalid input signal");
@@ -1242,7 +1272,7 @@ struct AMDGPUStreamTy {
     std::atomic_thread_fence(std::memory_order_release);
 
     // Issue the second step: host to host transfer.
-    Status = hsa_amd_signal_async_handler(
+    hsa_status_t Status = hsa_amd_signal_async_handler(
         InputSignal->get(), HSA_SIGNAL_CONDITION_EQ, 0, asyncActionCallback,
         (void *)&Slots[Curr]);
 
@@ -1318,16 +1348,13 @@ struct AMDGPUStreamTy {
 
     // Issue the second step: host to device transfer. Avoid defining the input
     // dependency if already satisfied.
-    hsa_status_t Status;
     if (InputSignal && InputSignal->load()) {
       hsa_signal_t InputSignalRaw = InputSignal->get();
-      Status = hsa_amd_memory_async_copy(Dst, Agent, Inter, Agent, CopySize, 1,
-                                         &InputSignalRaw, OutputSignal->get());
-    } else
-      Status = hsa_amd_memory_async_copy(Dst, Agent, Inter, Agent, CopySize, 0,
-                                         nullptr, OutputSignal->get());
-
-    return Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s");
+      return utils::asyncMemCopy(Dst, Agent, Inter, Agent, CopySize, 1,
+                                 &InputSignalRaw, OutputSignal->get());
+    }
+    return utils::asyncMemCopy(Dst, Agent, Inter, Agent, CopySize, 0, nullptr,
+                               OutputSignal->get());
   }
 
   // AMDGPUDeviceTy is incomplete here, passing the underlying agent instead
@@ -1353,17 +1380,13 @@ struct AMDGPUStreamTy {
     // allocated by this runtime or the caller made the appropriate
     // access calls.
 
-    hsa_status_t Status;
     if (InputSignal && InputSignal->load()) {
       hsa_signal_t InputSignalRaw = InputSignal->get();
-      Status =
-          hsa_amd_memory_async_copy(Dst, DstAgent, Src, SrcAgent, CopySize, 1,
-                                    &InputSignalRaw, OutputSignal->get());
-    } else
-      Status = hsa_amd_memory_async_copy(Dst, DstAgent, Src, SrcAgent, CopySize,
-                                         0, nullptr, OutputSignal->get());
-
-    return Plugin::check(Status, "Error in D2D hsa_amd_memory_async_copy: %s");
+      return utils::asyncMemCopy(Dst, DstAgent, Src, SrcAgent, CopySize, 1,
+                                 &InputSignalRaw, OutputSignal->get());
+    }
+    return utils::asyncMemCopy(Dst, DstAgent, Src, SrcAgent, CopySize, 0,
+                               nullptr, OutputSignal->get());
   }
 
   /// Synchronize with the stream. The current thread waits until all operations
@@ -2196,10 +2219,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
       if (auto Err = Signal.init())
         return Err;
 
-      Status = hsa_amd_memory_async_copy(TgtPtr, Agent, PinnedPtr, Agent, Size,
-                                         0, nullptr, Signal.get());
-      if (auto Err =
-              Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s"))
+      if (auto Err = utils::asyncMemCopy(TgtPtr, Agent, PinnedPtr, Agent, Size,
+                                         0, nullptr, Signal.get()))
         return Err;
 
       if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds()))
@@ -2257,10 +2278,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
       if (auto Err = Signal.init())
         return Err;
 
-      Status = hsa_amd_memory_async_copy(PinnedPtr, Agent, TgtPtr, Agent, Size,
-                                         0, nullptr, Signal.get());
-      if (auto Err =
-              Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s"))
+      if (auto Err = utils::asyncMemCopy(PinnedPtr, Agent, TgtPtr, Agent, Size,
+                                         0, nullptr, Signal.get()))
         return Err;
 
       if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds()))