[Openmp-commits] [openmp] [OpenMP][libomptarget] Enable automatic unified shared memory executi… (PR #75999)

via Openmp-commits openmp-commits at lists.llvm.org
Tue Dec 19 19:19:46 PST 2023


https://github.com/carlobertolli created https://github.com/llvm/llvm-project/pull/75999

…on (zero-copy) on MI300A.

This patch enables applications that did not request OpenMP unified_shared_memory to run with the same zero-copy behavior, where mapped memory does not result in extra memory allocations and memory copies, but CPU-allocated memory is accessed from the device. The name for this behavior is "automatic zero-copy" and it relies on detecting: that the runtime is running on a MI300A, that the user did not select unified_shared_memory in their program, and that XNACK (unified memory support) is enabled in the current GPU configuration. If all these conditions are met, then automatic zero-copy is triggered.

This patch is still missing support for global variables, which will be provided in a subsequent patch.

>From 15235fb8c75d79712a844326f179a3ca02a73ebd Mon Sep 17 00:00:00 2001
From: Carlo Bertolli <carlo.bertolli at amd.com>
Date: Tue, 19 Dec 2023 15:38:40 -0600
Subject: [PATCH] [OpenMP][libomptarget] Enable automatic unified shared memory
 execution (zero-copy) on MI300A.

This patch enables applications that did not request OpenMP unified_shared_memory to run with the same zero-copy behavior, where mapped memory does not result in extra memory allocations and memory copies, but CPU-allocated memory is accessed from the device.
The name for this behavior is "automatic zero-copy" and it relies on detecting: that the runtime is running on a MI300A, that the user did not select unified_shared_memory in their program, and that XNACK (unified memory support) is enabled in the current
GPU configuration. If all these conditions are met, then automatic zero-copy is triggered.

This patch is still missing support for global variables, which will be provided in a subsequent patch.

Co-authored-by: Thorsten Blass <thorsten.blass at amd.com>
Co-authored-by: Carlo Bertolli <carlo.bertolli at amd.com>
---
 openmp/libomptarget/include/PluginManager.h   | 12 ++++
 .../libomptarget/include/Shared/PluginAPI.h   |  1 +
 .../libomptarget/include/Shared/PluginAPI.inc |  1 +
 .../plugins-nextgen/amdgpu/src/rtl.cpp        | 66 ++++++++++++++++++-
 .../amdgpu/utils/UtilitiesRTL.h               | 28 ++++++++
 .../common/include/PluginInterface.h          |  4 ++
 .../common/src/PluginInterface.cpp            |  3 +
 openmp/libomptarget/src/OpenMP/Mapping.cpp    |  8 ++-
 openmp/libomptarget/src/PluginManager.cpp     |  5 ++
 .../test/mapping/auto_zero_copy.cpp           | 43 ++++++++++++
 10 files changed, 167 insertions(+), 4 deletions(-)
 create mode 100644 openmp/libomptarget/test/mapping/auto_zero_copy.cpp

diff --git a/openmp/libomptarget/include/PluginManager.h b/openmp/libomptarget/include/PluginManager.h
index a0499c37504c0d..a1bc98aa5d7a1f 100644
--- a/openmp/libomptarget/include/PluginManager.h
+++ b/openmp/libomptarget/include/PluginManager.h
@@ -190,6 +190,11 @@ struct PluginManager {
   /// Add \p Flags to the user provided requirements.
   void addRequirements(int64_t Flags) { Requirements.addRequirements(Flags); }
 
+  bool getUseAutoZeroCopy() const { return UseAutoZeroCopy; }
+  void setUseAutoZeroCopy(bool DetectedUseAutoZeroCopy) {
+    UseAutoZeroCopy = DetectedUseAutoZeroCopy;
+  }
+
 private:
   bool RTLsLoaded = false;
   llvm::SmallVector<__tgt_bin_desc *> DelayedBinDesc;
@@ -208,6 +213,13 @@ struct PluginManager {
 
   /// Devices associated with plugins, accesses to the container are exclusive.
   ProtectedObj<DeviceContainerTy> Devices;
+
+  /// Whe running on an APU, the GPU plugin may decide to
+  /// run in zero-copy even though the user did not program
+  /// their application with unified_shared_memory requirement.
+  /// This execution modality is called auto zero-copy and it is
+  /// cached information from the Plugin.
+  bool UseAutoZeroCopy = false;
 };
 
 extern PluginManager *PM;
diff --git a/openmp/libomptarget/include/Shared/PluginAPI.h b/openmp/libomptarget/include/Shared/PluginAPI.h
index 41d1908da21532..a8c3f337036d3d 100644
--- a/openmp/libomptarget/include/Shared/PluginAPI.h
+++ b/openmp/libomptarget/include/Shared/PluginAPI.h
@@ -225,6 +225,7 @@ int32_t __tgt_rtl_initialize_record_replay(int32_t DeviceId, int64_t MemorySize,
                                            void *VAddr, bool isRecord,
                                            bool SaveOutput,
                                            uint64_t &ReqPtrArgOffset);
+int32_t __tgt_rtl_use_auto_zero_copy();
 }
 
 #endif // OMPTARGET_SHARED_PLUGIN_API_H
diff --git a/openmp/libomptarget/include/Shared/PluginAPI.inc b/openmp/libomptarget/include/Shared/PluginAPI.inc
index 0949e4e593ddeb..575158297c89b6 100644
--- a/openmp/libomptarget/include/Shared/PluginAPI.inc
+++ b/openmp/libomptarget/include/Shared/PluginAPI.inc
@@ -48,3 +48,4 @@ PLUGIN_API_HANDLE(data_notify_mapped, false);
 PLUGIN_API_HANDLE(data_notify_unmapped, false);
 PLUGIN_API_HANDLE(set_device_offset, false);
 PLUGIN_API_HANDLE(initialize_record_replay, false);
+PLUGIN_API_HANDLE(use_auto_zero_copy, false);
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
index 0ffdabe5bcd420..5636643fb111b6 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -2913,7 +2913,7 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
   /// Create an AMDGPU plugin and initialize the AMDGPU driver.
   AMDGPUPluginTy()
       : GenericPluginTy(getTripleArch()), Initialized(false),
-        HostDevice(nullptr) {}
+        HostDevice(nullptr), IsAPU(false), OMPX_HSAXnack("HSA_XNACK", false) {}
 
   /// This class should not be copied.
   AMDGPUPluginTy(const AMDGPUPluginTy &) = delete;
@@ -2990,6 +2990,8 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
     if (auto Err = HostDevice->init())
       return std::move(Err);
 
+    isAPU();
+
     return NumDevices;
   }
 
@@ -3065,6 +3067,21 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
     return KernelAgents;
   }
 
+  /// Returns true if auto zero-copy the best configuration for the current arch
+  /// and binary.
+  bool useAutoZeroCopy() override final {
+    // Automatic zero-copy only applies to programs that did
+    // not request unified_shared_memory and are deployed on an
+    // APU with XNACK enabled.
+    if (getRequiresFlags() & OMP_REQ_UNIFIED_SHARED_MEMORY)
+      return false;
+
+    // XNACK can be enabled with with kernel boot parameter or with
+    // environment variable.
+    return (IsAPU &&
+            (utils::IsXnackEnabledViaKernelParam() || OMPX_HSAXnack.get()));
+  }
+
 private:
   /// Event handler that will be called by ROCr if an event is detected.
   static hsa_status_t eventHandler(const hsa_amd_event_t *Event, void *) {
@@ -3109,6 +3126,47 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
     return HSA_STATUS_ERROR;
   }
 
+  /// Detect if current architecture is an APU.
+  void isAPU() {
+    char GfxName[64];
+
+    if (!KernelAgents.size())
+      return;
+
+    // Do not allow for mixed APU+discrete GPU combinations: an APU can only be
+    // that, so only check the first GPU agent.
+    hsa_agent_t GPUAgent = KernelAgents[0];
+    std::memset((void *)&GfxName, 0, sizeof(char) * 64);
+
+    hsa_status_t Status = hsa_agent_get_info(
+        GPUAgent, (hsa_agent_info_t)HSA_AGENT_INFO_NAME, GfxName);
+    if (Status != HSA_STATUS_SUCCESS)
+      return;
+
+    std::string StrGfxName(GfxName);
+    std::transform(std::begin(StrGfxName), std::end(StrGfxName),
+                   std::begin(StrGfxName),
+                   [](char c) { return std::tolower(c); });
+
+    if (StrGfxName == "gfx940") {
+      IsAPU = true;
+      return;
+    } else if (StrGfxName == "gfx942") {
+      // can be MI300A or MI300X
+      uint32_t ChipID = 0;
+      Status = hsa_agent_get_info(
+          GPUAgent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_CHIP_ID, &ChipID);
+
+      if (Status != HSA_STATUS_SUCCESS)
+        return;
+
+      if (!(ChipID & 0x1)) {
+        IsAPU = true;
+        return;
+      }
+    }
+  }
+
   /// Indicate whether the HSA runtime was correctly initialized. Even if there
   /// is no available devices this boolean will be true. It indicates whether
   /// we can safely call HSA functions (e.g., hsa_shut_down).
@@ -3122,6 +3180,12 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
 
   /// The device representing all HSA host agents.
   AMDHostDeviceTy *HostDevice;
+
+  /// Is the plugin associated with an APU?
+  bool IsAPU;
+
+  /// Value of HSA_XNACK environment variable.
+  BoolEnvar OMPX_HSAXnack;
 };
 
 Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h b/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
index 289dbf8e3d09d1..b4f0a193992bc9 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
@@ -88,6 +88,34 @@ StringRef parseTargetID(StringRef TargetID, StringMap<bool> &FeatureMap) {
   return Arch;
 }
 
+bool IsXnackEnabledViaKernelParam() {
+
+  ErrorOr<std::unique_ptr<MemoryBuffer>> FileOrError =
+      MemoryBuffer::getFileAsStream("/proc/cmdline");
+
+  if (std::error_code ErrorCode = FileOrError.getError()) {
+    FAILURE_MESSAGE("Cannot open /proc/cmdline : %s\n",
+                    ErrorCode.message().c_str());
+    return false;
+  }
+
+  StringRef FileContent = (FileOrError.get())->getBuffer();
+
+  StringRef RefString("amdgpu.noretry=");
+  int SizeOfRefString = RefString.size();
+
+  size_t Pos = FileContent.find_insensitive(RefString);
+  // Is noretry defined?
+  if (Pos != StringRef::npos) {
+    bool NoRetryValue = FileContent[Pos + SizeOfRefString] - '0';
+    // is noretry set to 0
+    if (!NoRetryValue)
+      return true;
+  }
+
+  return false;
+}
+
 /// Check if an image is compatible with current system's environment.
 bool isImageCompatibleWithEnv(const __tgt_image_info *Info,
                               StringRef EnvTargetID) {
diff --git a/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h b/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h
index 28484ae4d5f5ea..7f3fb6d939d5e5 100644
--- a/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h
+++ b/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h
@@ -1073,6 +1073,10 @@ struct GenericPluginTy {
   /// Indicate whether the plugin supports empty images.
   virtual bool supportsEmptyImages() const { return false; }
 
+  /// Returns true if current plugin architecture is an APU
+  /// and unified_shared_memory was not requested by the program.
+  virtual bool useAutoZeroCopy() { return false; }
+
 protected:
   /// Indicate whether a device id is valid.
   bool isValidDeviceId(int32_t DeviceId) const {
diff --git a/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp
index 1c9777dba7a9aa..8465502ae4e519 100644
--- a/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp
@@ -2062,6 +2062,9 @@ int32_t __tgt_rtl_set_device_offset(int32_t DeviceIdOffset) {
   return OFFLOAD_SUCCESS;
 }
 
+int32_t __tgt_rtl_use_auto_zero_copy() {
+  return Plugin::get().useAutoZeroCopy();
+}
 #ifdef __cplusplus
 }
 #endif
diff --git a/openmp/libomptarget/src/OpenMP/Mapping.cpp b/openmp/libomptarget/src/OpenMP/Mapping.cpp
index a5c24810e0af95..755394699272cf 100644
--- a/openmp/libomptarget/src/OpenMP/Mapping.cpp
+++ b/openmp/libomptarget/src/OpenMP/Mapping.cpp
@@ -252,8 +252,9 @@ TargetPointerResultTy MappingInfoTy::getTargetPointer(
       MESSAGE("device mapping required by 'present' map type modifier does not "
               "exist for host address " DPxMOD " (%" PRId64 " bytes)",
               DPxPTR(HstPtrBegin), Size);
-  } else if (PM->getRequirements() & OMP_REQ_UNIFIED_SHARED_MEMORY &&
-             !HasCloseModifier) {
+  } else if ((PM->getRequirements() & OMP_REQ_UNIFIED_SHARED_MEMORY &&
+              !HasCloseModifier) ||
+             PM->getUseAutoZeroCopy()) {
     // If unified shared memory is active, implicitly mapped variables that are
     // not privatized use host address. Any explicitly mapped variables also use
     // host address where correctness is not impeded. In all other cases maps
@@ -415,7 +416,8 @@ TargetPointerResultTy MappingInfoTy::getTgtPtrBegin(
          LR.TPR.getEntry()->dynRefCountToStr().c_str(), DynRefCountAction,
          LR.TPR.getEntry()->holdRefCountToStr().c_str(), HoldRefCountAction);
     LR.TPR.TargetPointer = (void *)TP;
-  } else if (PM->getRequirements() & OMP_REQ_UNIFIED_SHARED_MEMORY) {
+  } else if (PM->getRequirements() & OMP_REQ_UNIFIED_SHARED_MEMORY ||
+             PM->getUseAutoZeroCopy()) {
     // If the value isn't found in the mapping and unified shared memory
     // is on then it means we have stumbled upon a value which we need to
     // use directly from the host.
diff --git a/openmp/libomptarget/src/PluginManager.cpp b/openmp/libomptarget/src/PluginManager.cpp
index 34a0d1dcefa526..b36e9dceb0ce86 100644
--- a/openmp/libomptarget/src/PluginManager.cpp
+++ b/openmp/libomptarget/src/PluginManager.cpp
@@ -146,6 +146,7 @@ void PluginAdaptorTy::initDevices(PluginManager &PM) {
   ExclusiveDevicesAccessor->reserve(DeviceOffset + NumPD);
   for (int32_t PDevI = 0, UserDevId = DeviceOffset; PDevI < NumPD; PDevI++) {
     auto Device = std::make_unique<DeviceTy>(this, UserDevId, PDevI);
+
     if (auto Err = Device->init()) {
       DP("Skip plugin known device %d: %s\n", PDevI,
          toString(std::move(Err)).c_str());
@@ -157,6 +158,10 @@ void PluginAdaptorTy::initDevices(PluginManager &PM) {
     ++UserDevId;
   }
 
+  // Only support APU architectures without externally attached GPUs.
+  if (use_auto_zero_copy)
+    PM.setUseAutoZeroCopy(use_auto_zero_copy());
+
   DP("Plugin adaptor " DPxMOD " has index %d, exposes %d out of %d devices!\n",
      DPxPTR(LibraryHandler.get()), DeviceOffset, NumberOfUserDevices,
      NumberOfPluginDevices);
diff --git a/openmp/libomptarget/test/mapping/auto_zero_copy.cpp b/openmp/libomptarget/test/mapping/auto_zero_copy.cpp
new file mode 100644
index 00000000000000..39c48e1d0f4856
--- /dev/null
+++ b/openmp/libomptarget/test/mapping/auto_zero_copy.cpp
@@ -0,0 +1,43 @@
+
+// RUN: %libomptarget-compilexx-generic
+// RUN: env HSA_XNACK=1 LIBOMPTARGET_DEBUG=1 %libomptarget-run-generic 2>&1 \
+// RUN: | %fcheck-generic -check-prefix=DEBUG -check-prefix=CHECK
+// REQUIRES: libomptarget-debug
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: nvptx64-nvidia-cuda
+// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+#include <cstdio>
+
+int main() {
+  int n = 1024;
+
+  // test various mapping types
+  int *a = new int[n];
+  int k = 3;
+  int b[n];
+
+  for (int i = 0; i < n; i++)
+    b[i] = i;
+
+// DEBUG: Return HstPtrBegin 0x{{.*}} Size=4096 for unified shared memory
+// DEBUG: Return HstPtrBegin 0x{{.*}} Size=4096 for unified shared memory
+#pragma omp target teams distribute parallel for map(tofrom : a[ : n])         \
+    map(to : b[ : n])
+  for (int i = 0; i < n; i++)
+    a[i] = i + b[i] + k;
+
+  int err = 0;
+  for (int i = 0; i < n; i++)
+    if (a[i] != i + b[i] + k)
+      err++;
+
+  // CHECK: PASS
+  if (err == 0)
+    printf("PASS\n");
+  return err;
+}



More information about the Openmp-commits mailing list