[Openmp-commits] [openmp] [OpenMP][libomptarget] Enable automatic unified shared memory executi… (PR #75999)
via Openmp-commits
openmp-commits at lists.llvm.org
Tue Dec 19 19:19:46 PST 2023
https://github.com/carlobertolli created https://github.com/llvm/llvm-project/pull/75999
…on (zero-copy) on MI300A.
This patch enables applications that did not request OpenMP unified_shared_memory to run with the same zero-copy behavior, where mapped memory does not result in extra memory allocations and memory copies, but CPU-allocated memory is accessed from the device. The name for this behavior is "automatic zero-copy" and it relies on detecting: that the runtime is running on a MI300A, that the user did not select unified_shared_memory in their program, and that XNACK (unified memory support) is enabled in the current GPU configuration. If all these conditions are met, then automatic zero-copy is triggered.
This patch is still missing support for global variables, which will be provided in a subsequent patch.
>From 15235fb8c75d79712a844326f179a3ca02a73ebd Mon Sep 17 00:00:00 2001
From: Carlo Bertolli <carlo.bertolli at amd.com>
Date: Tue, 19 Dec 2023 15:38:40 -0600
Subject: [PATCH] [OpenMP][libomptarget] Enable automatic unified shared memory
execution (zero-copy) on MI300A.
This patch enables applications that did not request OpenMP unified_shared_memory to run with the same zero-copy behavior, where mapped memory does not result in extra memory allocations and memory copies, but CPU-allocated memory is accessed from the device.
The name for this behavior is "automatic zero-copy" and it relies on detecting: that the runtime is running on a MI300A, that the user did not select unified_shared_memory in their program, and that XNACK (unified memory support) is enabled in the current
GPU configuration. If all these conditions are met, then automatic zero-copy is triggered.
This patch is still missing support for global variables, which will be provided in a subsequent patch.
Co-authored-by: Thorsten Blass <thorsten.blass at amd.com>
Co-authored-by: Carlo Bertolli <carlo.bertolli at amd.com>
---
openmp/libomptarget/include/PluginManager.h | 12 ++++
.../libomptarget/include/Shared/PluginAPI.h | 1 +
.../libomptarget/include/Shared/PluginAPI.inc | 1 +
.../plugins-nextgen/amdgpu/src/rtl.cpp | 66 ++++++++++++++++++-
.../amdgpu/utils/UtilitiesRTL.h | 28 ++++++++
.../common/include/PluginInterface.h | 4 ++
.../common/src/PluginInterface.cpp | 3 +
openmp/libomptarget/src/OpenMP/Mapping.cpp | 8 ++-
openmp/libomptarget/src/PluginManager.cpp | 5 ++
.../test/mapping/auto_zero_copy.cpp | 43 ++++++++++++
10 files changed, 167 insertions(+), 4 deletions(-)
create mode 100644 openmp/libomptarget/test/mapping/auto_zero_copy.cpp
diff --git a/openmp/libomptarget/include/PluginManager.h b/openmp/libomptarget/include/PluginManager.h
index a0499c37504c0d..a1bc98aa5d7a1f 100644
--- a/openmp/libomptarget/include/PluginManager.h
+++ b/openmp/libomptarget/include/PluginManager.h
@@ -190,6 +190,11 @@ struct PluginManager {
/// Add \p Flags to the user provided requirements.
void addRequirements(int64_t Flags) { Requirements.addRequirements(Flags); }
+ bool getUseAutoZeroCopy() const { return UseAutoZeroCopy; }
+ void setUseAutoZeroCopy(bool DetectedUseAutoZeroCopy) {
+ UseAutoZeroCopy = DetectedUseAutoZeroCopy;
+ }
+
private:
bool RTLsLoaded = false;
llvm::SmallVector<__tgt_bin_desc *> DelayedBinDesc;
@@ -208,6 +213,13 @@ struct PluginManager {
/// Devices associated with plugins, accesses to the container are exclusive.
ProtectedObj<DeviceContainerTy> Devices;
+
+ /// Whe running on an APU, the GPU plugin may decide to
+ /// run in zero-copy even though the user did not program
+ /// their application with unified_shared_memory requirement.
+ /// This execution modality is called auto zero-copy and it is
+ /// cached information from the Plugin.
+ bool UseAutoZeroCopy = false;
};
extern PluginManager *PM;
diff --git a/openmp/libomptarget/include/Shared/PluginAPI.h b/openmp/libomptarget/include/Shared/PluginAPI.h
index 41d1908da21532..a8c3f337036d3d 100644
--- a/openmp/libomptarget/include/Shared/PluginAPI.h
+++ b/openmp/libomptarget/include/Shared/PluginAPI.h
@@ -225,6 +225,7 @@ int32_t __tgt_rtl_initialize_record_replay(int32_t DeviceId, int64_t MemorySize,
void *VAddr, bool isRecord,
bool SaveOutput,
uint64_t &ReqPtrArgOffset);
+int32_t __tgt_rtl_use_auto_zero_copy();
}
#endif // OMPTARGET_SHARED_PLUGIN_API_H
diff --git a/openmp/libomptarget/include/Shared/PluginAPI.inc b/openmp/libomptarget/include/Shared/PluginAPI.inc
index 0949e4e593ddeb..575158297c89b6 100644
--- a/openmp/libomptarget/include/Shared/PluginAPI.inc
+++ b/openmp/libomptarget/include/Shared/PluginAPI.inc
@@ -48,3 +48,4 @@ PLUGIN_API_HANDLE(data_notify_mapped, false);
PLUGIN_API_HANDLE(data_notify_unmapped, false);
PLUGIN_API_HANDLE(set_device_offset, false);
PLUGIN_API_HANDLE(initialize_record_replay, false);
+PLUGIN_API_HANDLE(use_auto_zero_copy, false);
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
index 0ffdabe5bcd420..5636643fb111b6 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -2913,7 +2913,7 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
/// Create an AMDGPU plugin and initialize the AMDGPU driver.
AMDGPUPluginTy()
: GenericPluginTy(getTripleArch()), Initialized(false),
- HostDevice(nullptr) {}
+ HostDevice(nullptr), IsAPU(false), OMPX_HSAXnack("HSA_XNACK", false) {}
/// This class should not be copied.
AMDGPUPluginTy(const AMDGPUPluginTy &) = delete;
@@ -2990,6 +2990,8 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
if (auto Err = HostDevice->init())
return std::move(Err);
+ isAPU();
+
return NumDevices;
}
@@ -3065,6 +3067,21 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
return KernelAgents;
}
+ /// Returns true if auto zero-copy the best configuration for the current arch
+ /// and binary.
+ bool useAutoZeroCopy() override final {
+ // Automatic zero-copy only applies to programs that did
+ // not request unified_shared_memory and are deployed on an
+ // APU with XNACK enabled.
+ if (getRequiresFlags() & OMP_REQ_UNIFIED_SHARED_MEMORY)
+ return false;
+
+ // XNACK can be enabled with with kernel boot parameter or with
+ // environment variable.
+ return (IsAPU &&
+ (utils::IsXnackEnabledViaKernelParam() || OMPX_HSAXnack.get()));
+ }
+
private:
/// Event handler that will be called by ROCr if an event is detected.
static hsa_status_t eventHandler(const hsa_amd_event_t *Event, void *) {
@@ -3109,6 +3126,47 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
return HSA_STATUS_ERROR;
}
+ /// Detect if current architecture is an APU.
+ void isAPU() {
+ char GfxName[64];
+
+ if (!KernelAgents.size())
+ return;
+
+ // Do not allow for mixed APU+discrete GPU combinations: an APU can only be
+ // that, so only check the first GPU agent.
+ hsa_agent_t GPUAgent = KernelAgents[0];
+ std::memset((void *)&GfxName, 0, sizeof(char) * 64);
+
+ hsa_status_t Status = hsa_agent_get_info(
+ GPUAgent, (hsa_agent_info_t)HSA_AGENT_INFO_NAME, GfxName);
+ if (Status != HSA_STATUS_SUCCESS)
+ return;
+
+ std::string StrGfxName(GfxName);
+ std::transform(std::begin(StrGfxName), std::end(StrGfxName),
+ std::begin(StrGfxName),
+ [](char c) { return std::tolower(c); });
+
+ if (StrGfxName == "gfx940") {
+ IsAPU = true;
+ return;
+ } else if (StrGfxName == "gfx942") {
+ // can be MI300A or MI300X
+ uint32_t ChipID = 0;
+ Status = hsa_agent_get_info(
+ GPUAgent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_CHIP_ID, &ChipID);
+
+ if (Status != HSA_STATUS_SUCCESS)
+ return;
+
+ if (!(ChipID & 0x1)) {
+ IsAPU = true;
+ return;
+ }
+ }
+ }
+
/// Indicate whether the HSA runtime was correctly initialized. Even if there
/// is no available devices this boolean will be true. It indicates whether
/// we can safely call HSA functions (e.g., hsa_shut_down).
@@ -3122,6 +3180,12 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
/// The device representing all HSA host agents.
AMDHostDeviceTy *HostDevice;
+
+ /// Is the plugin associated with an APU?
+ bool IsAPU;
+
+ /// Value of HSA_XNACK environment variable.
+ BoolEnvar OMPX_HSAXnack;
};
Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h b/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
index 289dbf8e3d09d1..b4f0a193992bc9 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
@@ -88,6 +88,34 @@ StringRef parseTargetID(StringRef TargetID, StringMap<bool> &FeatureMap) {
return Arch;
}
+bool IsXnackEnabledViaKernelParam() {
+
+ ErrorOr<std::unique_ptr<MemoryBuffer>> FileOrError =
+ MemoryBuffer::getFileAsStream("/proc/cmdline");
+
+ if (std::error_code ErrorCode = FileOrError.getError()) {
+ FAILURE_MESSAGE("Cannot open /proc/cmdline : %s\n",
+ ErrorCode.message().c_str());
+ return false;
+ }
+
+ StringRef FileContent = (FileOrError.get())->getBuffer();
+
+ StringRef RefString("amdgpu.noretry=");
+ int SizeOfRefString = RefString.size();
+
+ size_t Pos = FileContent.find_insensitive(RefString);
+ // Is noretry defined?
+ if (Pos != StringRef::npos) {
+ bool NoRetryValue = FileContent[Pos + SizeOfRefString] - '0';
+ // is noretry set to 0
+ if (!NoRetryValue)
+ return true;
+ }
+
+ return false;
+}
+
/// Check if an image is compatible with current system's environment.
bool isImageCompatibleWithEnv(const __tgt_image_info *Info,
StringRef EnvTargetID) {
diff --git a/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h b/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h
index 28484ae4d5f5ea..7f3fb6d939d5e5 100644
--- a/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h
+++ b/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h
@@ -1073,6 +1073,10 @@ struct GenericPluginTy {
/// Indicate whether the plugin supports empty images.
virtual bool supportsEmptyImages() const { return false; }
+ /// Returns true if current plugin architecture is an APU
+ /// and unified_shared_memory was not requested by the program.
+ virtual bool useAutoZeroCopy() { return false; }
+
protected:
/// Indicate whether a device id is valid.
bool isValidDeviceId(int32_t DeviceId) const {
diff --git a/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp
index 1c9777dba7a9aa..8465502ae4e519 100644
--- a/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp
@@ -2062,6 +2062,9 @@ int32_t __tgt_rtl_set_device_offset(int32_t DeviceIdOffset) {
return OFFLOAD_SUCCESS;
}
+int32_t __tgt_rtl_use_auto_zero_copy() {
+ return Plugin::get().useAutoZeroCopy();
+}
#ifdef __cplusplus
}
#endif
diff --git a/openmp/libomptarget/src/OpenMP/Mapping.cpp b/openmp/libomptarget/src/OpenMP/Mapping.cpp
index a5c24810e0af95..755394699272cf 100644
--- a/openmp/libomptarget/src/OpenMP/Mapping.cpp
+++ b/openmp/libomptarget/src/OpenMP/Mapping.cpp
@@ -252,8 +252,9 @@ TargetPointerResultTy MappingInfoTy::getTargetPointer(
MESSAGE("device mapping required by 'present' map type modifier does not "
"exist for host address " DPxMOD " (%" PRId64 " bytes)",
DPxPTR(HstPtrBegin), Size);
- } else if (PM->getRequirements() & OMP_REQ_UNIFIED_SHARED_MEMORY &&
- !HasCloseModifier) {
+ } else if ((PM->getRequirements() & OMP_REQ_UNIFIED_SHARED_MEMORY &&
+ !HasCloseModifier) ||
+ PM->getUseAutoZeroCopy()) {
// If unified shared memory is active, implicitly mapped variables that are
// not privatized use host address. Any explicitly mapped variables also use
// host address where correctness is not impeded. In all other cases maps
@@ -415,7 +416,8 @@ TargetPointerResultTy MappingInfoTy::getTgtPtrBegin(
LR.TPR.getEntry()->dynRefCountToStr().c_str(), DynRefCountAction,
LR.TPR.getEntry()->holdRefCountToStr().c_str(), HoldRefCountAction);
LR.TPR.TargetPointer = (void *)TP;
- } else if (PM->getRequirements() & OMP_REQ_UNIFIED_SHARED_MEMORY) {
+ } else if (PM->getRequirements() & OMP_REQ_UNIFIED_SHARED_MEMORY ||
+ PM->getUseAutoZeroCopy()) {
// If the value isn't found in the mapping and unified shared memory
// is on then it means we have stumbled upon a value which we need to
// use directly from the host.
diff --git a/openmp/libomptarget/src/PluginManager.cpp b/openmp/libomptarget/src/PluginManager.cpp
index 34a0d1dcefa526..b36e9dceb0ce86 100644
--- a/openmp/libomptarget/src/PluginManager.cpp
+++ b/openmp/libomptarget/src/PluginManager.cpp
@@ -146,6 +146,7 @@ void PluginAdaptorTy::initDevices(PluginManager &PM) {
ExclusiveDevicesAccessor->reserve(DeviceOffset + NumPD);
for (int32_t PDevI = 0, UserDevId = DeviceOffset; PDevI < NumPD; PDevI++) {
auto Device = std::make_unique<DeviceTy>(this, UserDevId, PDevI);
+
if (auto Err = Device->init()) {
DP("Skip plugin known device %d: %s\n", PDevI,
toString(std::move(Err)).c_str());
@@ -157,6 +158,10 @@ void PluginAdaptorTy::initDevices(PluginManager &PM) {
++UserDevId;
}
+ // Only support APU architectures without externally attached GPUs.
+ if (use_auto_zero_copy)
+ PM.setUseAutoZeroCopy(use_auto_zero_copy());
+
DP("Plugin adaptor " DPxMOD " has index %d, exposes %d out of %d devices!\n",
DPxPTR(LibraryHandler.get()), DeviceOffset, NumberOfUserDevices,
NumberOfPluginDevices);
diff --git a/openmp/libomptarget/test/mapping/auto_zero_copy.cpp b/openmp/libomptarget/test/mapping/auto_zero_copy.cpp
new file mode 100644
index 00000000000000..39c48e1d0f4856
--- /dev/null
+++ b/openmp/libomptarget/test/mapping/auto_zero_copy.cpp
@@ -0,0 +1,43 @@
+
+// RUN: %libomptarget-compilexx-generic
+// RUN: env HSA_XNACK=1 LIBOMPTARGET_DEBUG=1 %libomptarget-run-generic 2>&1 \
+// RUN: | %fcheck-generic -check-prefix=DEBUG -check-prefix=CHECK
+// REQUIRES: libomptarget-debug
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: nvptx64-nvidia-cuda
+// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+#include <cstdio>
+
+int main() {
+ int n = 1024;
+
+ // test various mapping types
+ int *a = new int[n];
+ int k = 3;
+ int b[n];
+
+ for (int i = 0; i < n; i++)
+ b[i] = i;
+
+// DEBUG: Return HstPtrBegin 0x{{.*}} Size=4096 for unified shared memory
+// DEBUG: Return HstPtrBegin 0x{{.*}} Size=4096 for unified shared memory
+#pragma omp target teams distribute parallel for map(tofrom : a[ : n]) \
+ map(to : b[ : n])
+ for (int i = 0; i < n; i++)
+ a[i] = i + b[i] + k;
+
+ int err = 0;
+ for (int i = 0; i < n; i++)
+ if (a[i] != i + b[i] + k)
+ err++;
+
+ // CHECK: PASS
+ if (err == 0)
+ printf("PASS\n");
+ return err;
+}
More information about the Openmp-commits
mailing list