[Openmp-commits] [openmp] [OpenMP] Enable automatic unified shared memory on MI300A. (PR #77512)
via Openmp-commits
openmp-commits at lists.llvm.org
Tue Jan 9 11:08:50 PST 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: None (carlobertolli)
<details>
<summary>Changes</summary>
This patch enables applications that did not request OpenMP unified_shared_memory to run with the same zero-copy behavior, where mapped memory does not result in extra memory allocations and memory copies, but CPU-allocated memory is accessed from the device. The name for this behavior is "automatic zero-copy" and it relies on detecting: that the runtime is running on a MI300A, that the user did not select unified_shared_memory in their program, and that XNACK (unified memory support) is enabled in the current GPU configuration. If all these conditions are met, then automatic zero-copy is triggered.
This patch also introduces an environment variable OMPX_APU_MAPS that, if set, triggers automatic zero-copy also on non APU GPUs (e.g., on discrete GPUs).
This patch is still missing support for global variables, which will be provided in a subsequent patch.
---
Full diff: https://github.com/llvm/llvm-project/pull/77512.diff
13 Files Affected:
- (modified) openmp/libomptarget/include/Shared/PluginAPI.h (+3)
- (modified) openmp/libomptarget/include/Shared/PluginAPI.inc (+1)
- (modified) openmp/libomptarget/include/Shared/Requirements.h (+14-1)
- (modified) openmp/libomptarget/include/device.h (+3)
- (modified) openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h (+1)
- (modified) openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp (+49)
- (modified) openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h (+28)
- (modified) openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h (+5)
- (modified) openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp (+10)
- (modified) openmp/libomptarget/src/OpenMP/Mapping.cpp (+10-3)
- (modified) openmp/libomptarget/src/PluginManager.cpp (+13)
- (modified) openmp/libomptarget/src/device.cpp (+6)
- (added) openmp/libomptarget/test/mapping/auto_zero_copy.cpp (+55)
``````````diff
diff --git a/openmp/libomptarget/include/Shared/PluginAPI.h b/openmp/libomptarget/include/Shared/PluginAPI.h
index c6aacf4ce2124b..aece53d7ee1caa 100644
--- a/openmp/libomptarget/include/Shared/PluginAPI.h
+++ b/openmp/libomptarget/include/Shared/PluginAPI.h
@@ -219,6 +219,9 @@ int32_t __tgt_rtl_initialize_record_replay(int32_t DeviceId, int64_t MemorySize,
void *VAddr, bool isRecord,
bool SaveOutput,
uint64_t &ReqPtrArgOffset);
+
+// Returns true if the device \p DeviceId suggests to use auto zero-copy.
+int32_t __tgt_rtl_use_auto_zero_copy(int32_t DeviceId);
}
#endif // OMPTARGET_SHARED_PLUGIN_API_H
diff --git a/openmp/libomptarget/include/Shared/PluginAPI.inc b/openmp/libomptarget/include/Shared/PluginAPI.inc
index 25ebe7d437f9d1..b842c6eef1d4fc 100644
--- a/openmp/libomptarget/include/Shared/PluginAPI.inc
+++ b/openmp/libomptarget/include/Shared/PluginAPI.inc
@@ -47,3 +47,4 @@ PLUGIN_API_HANDLE(data_notify_mapped, false);
PLUGIN_API_HANDLE(data_notify_unmapped, false);
PLUGIN_API_HANDLE(set_device_offset, false);
PLUGIN_API_HANDLE(initialize_record_replay, false);
+PLUGIN_API_HANDLE(use_auto_zero_copy, false);
diff --git a/openmp/libomptarget/include/Shared/Requirements.h b/openmp/libomptarget/include/Shared/Requirements.h
index 19d6b8ffca495f..b16a1650f0c403 100644
--- a/openmp/libomptarget/include/Shared/Requirements.h
+++ b/openmp/libomptarget/include/Shared/Requirements.h
@@ -33,7 +33,12 @@ enum OpenMPOffloadingRequiresDirFlags : int64_t {
/// unified_shared_memory clause.
OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008,
/// dynamic_allocators clause.
- OMP_REQ_DYNAMIC_ALLOCATORS = 0x010
+ OMP_REQ_DYNAMIC_ALLOCATORS = 0x010,
+ /// Auto zero-copy extension:
+ /// when running on an APU, the GPU plugin may decide to
+ /// run in zero-copy even though the user did not program
+ /// their application with unified_shared_memory requirement.
+ OMPX_REQ_AUTO_ZERO_COPY = 0x020
};
class RequirementCollection {
@@ -65,6 +70,14 @@ class RequirementCollection {
return;
}
+ // Auto zero-copy is only valid when no other requirement has been set
+ // and it is computed at device initialization time, after the requirement
+ // flag has already been set to OMP_REQ_NONE.
+ if (SetFlags == OMP_REQ_NONE && NewFlags == OMPX_REQ_AUTO_ZERO_COPY) {
+ SetFlags = NewFlags;
+ return;
+ }
+
// If multiple compilation units are present enforce
// consistency across all of them for require clauses:
// - reverse_offload
diff --git a/openmp/libomptarget/include/device.h b/openmp/libomptarget/include/device.h
index e94f48891dab36..3023fba6cc64db 100644
--- a/openmp/libomptarget/include/device.h
+++ b/openmp/libomptarget/include/device.h
@@ -164,6 +164,9 @@ struct DeviceTy {
/// Print all offload entries to stderr.
void dumpOffloadEntries();
+ /// Ask the device whether the runtime should use auto zero-copy.
+ bool useAutoZeroCopy();
+
private:
/// Deinitialize the device (and plugin).
void deinit();
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h b/openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
index 9c59d3bf824de3..3117763e35896d 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
@@ -63,6 +63,7 @@ typedef enum {
} hsa_amd_memory_pool_access_t;
typedef enum hsa_amd_agent_info_s {
+ HSA_AMD_AGENT_INFO_CHIP_ID = 0xA000,
HSA_AMD_AGENT_INFO_CACHELINE_SIZE = 0xA001,
HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT = 0xA002,
HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY = 0xA003,
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
index b67642e9e1bcb3..f72b741cfd675a 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -1848,6 +1848,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
OMPX_StreamBusyWait("LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT", 2000000),
OMPX_UseMultipleSdmaEngines(
"LIBOMPTARGET_AMDGPU_USE_MULTIPLE_SDMA_ENGINES", false),
+ HSAXnackEnv("HSA_XNACK", false), OMPX_ApuMaps("OMPX_APU_MAPS", false),
AMDGPUStreamManager(*this, Agent), AMDGPUEventManager(*this),
AMDGPUSignalManager(*this), Agent(Agent), HostDevice(HostDevice) {}
@@ -1940,6 +1941,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
if (auto Err = AMDGPUSignalManager.init(OMPX_InitialNumSignals))
return Err;
+ // detect if device is an APU.
+ if (auto Err = checkIfAPU())
+ return Err;
+
return Plugin::success();
}
@@ -2631,6 +2636,16 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
return Plugin::success();
}
+ /// Returns true if auto zero-copy the best configuration for the current
+ /// arch.
+ bool useAutoZeroCopyImpl() override {
+ // XNACK can be enabled with with kernel boot parameter or with
+ // environment variable. Automatic zero-copy is used on APUs
+ // and on dGPUs when OMPX_APU_MAPS is set to true.
+ return ((IsAPU || OMPX_ApuMaps) &&
+ (HSAXnackEnv || utils::isXnackEnabledViaKernelParam()));
+ }
+
/// Getters and setters for stack and heap sizes.
Error getDeviceStackSize(uint64_t &Value) override {
Value = StackSize;
@@ -2728,6 +2743,30 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
return Err;
}
+ /// Detect if current architecture is an APU.
+ Error checkIfAPU() {
+ std::string StrGfxName(ComputeUnitKind);
+ std::transform(std::begin(StrGfxName), std::end(StrGfxName),
+ std::begin(StrGfxName),
+ [](char c) { return std::tolower(c); });
+ if (StrGfxName == "gfx940") {
+ IsAPU = true;
+ return Plugin::success();
+ }
+ if (StrGfxName == "gfx942") {
+ // can be MI300A or MI300X
+ uint32_t ChipID = 0;
+ if (auto Err = getDeviceAttr(HSA_AMD_AGENT_INFO_CHIP_ID, ChipID))
+ return Err;
+
+ if (!(ChipID & 0x1)) {
+ IsAPU = true;
+ return Plugin::success();
+ }
+ }
+ return Plugin::success();
+ }
+
/// Envar for controlling the number of HSA queues per device. High number of
/// queues may degrade performance.
UInt32Envar OMPX_NumQueues;
@@ -2764,6 +2803,13 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
/// Use ROCm 5.7 interface for multiple SDMA engines
BoolEnvar OMPX_UseMultipleSdmaEngines;
+ /// Value of HSA_XNACK environment variable.
+ BoolEnvar HSAXnackEnv;
+
+ /// Value of OMPX_APU_MAPS env var used to force
+ /// automatic zero-copy behavior on non-APU GPUs.
+ BoolEnvar OMPX_ApuMaps;
+
/// Stream manager for AMDGPU streams.
AMDGPUStreamManagerTy AMDGPUStreamManager;
@@ -2794,6 +2840,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
/// The current size of the stack that will be used in cases where it could
/// not be statically determined.
uint64_t StackSize = 16 * 1024 /* 16 KB */;
+
+ /// Is the plugin associated with an APU?
+ bool IsAPU{false};
};
Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) {
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h b/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
index 58a3b5df00fac6..c5a58f82441458 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
@@ -116,6 +116,34 @@ inline bool isImageCompatibleWithEnv(StringRef ImageArch, uint32_t ImageFlags,
return true;
}
+inline bool isXnackEnabledViaKernelParam() {
+
+ ErrorOr<std::unique_ptr<MemoryBuffer>> FileOrError =
+ MemoryBuffer::getFileAsStream("/proc/cmdline");
+
+ if (std::error_code ErrorCode = FileOrError.getError()) {
+ FAILURE_MESSAGE("Cannot open /proc/cmdline : %s\n",
+ ErrorCode.message().c_str());
+ return false;
+ }
+
+ StringRef FileContent = (FileOrError.get())->getBuffer();
+
+ StringRef RefString("amdgpu.noretry=");
+ int SizeOfRefString = RefString.size();
+
+ size_t Pos = FileContent.find_insensitive(RefString);
+ // Is noretry defined?
+ if (Pos != StringRef::npos) {
+ bool NoRetryValue = FileContent[Pos + SizeOfRefString] - '0';
+ // is noretry set to 0
+ if (!NoRetryValue)
+ return true;
+ }
+
+ return false;
+}
+
struct KernelMetaDataTy {
uint64_t KernelObject;
uint32_t GroupSegmentList;
diff --git a/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h b/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h
index b85dc146d86d2f..abe85f43c2e726 100644
--- a/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h
+++ b/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h
@@ -872,6 +872,11 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
virtual Error getDeviceStackSize(uint64_t &V) = 0;
+ /// Returns true if current plugin architecture is an APU
+ /// and unified_shared_memory was not requested by the program.
+ bool useAutoZeroCopy();
+ virtual bool useAutoZeroCopyImpl() { return false; }
+
private:
/// Register offload entry for global variable.
Error registerGlobalOffloadEntry(DeviceImageTy &DeviceImage,
diff --git a/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp
index 9490e58fc669cd..e82c2f7bef14f0 100644
--- a/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp
@@ -1561,6 +1561,8 @@ Error GenericDeviceTy::syncEvent(void *EventPtr) {
return syncEventImpl(EventPtr);
}
+bool GenericDeviceTy::useAutoZeroCopy() { return useAutoZeroCopyImpl(); }
+
Error GenericPluginTy::init() {
auto NumDevicesOrErr = initImpl();
if (!NumDevicesOrErr)
@@ -2073,6 +2075,14 @@ int32_t __tgt_rtl_set_device_offset(int32_t DeviceIdOffset) {
return OFFLOAD_SUCCESS;
}
+int32_t __tgt_rtl_use_auto_zero_copy(int32_t DeviceId) {
+ // Automatic zero-copy only applies to programs that did
+ // not request unified_shared_memory and are deployed on an
+ // APU with XNACK enabled.
+ if (Plugin::get().getRequiresFlags() & OMP_REQ_UNIFIED_SHARED_MEMORY)
+ return false;
+ return Plugin::get().getDevice(DeviceId).useAutoZeroCopy();
+}
#ifdef __cplusplus
}
#endif
diff --git a/openmp/libomptarget/src/OpenMP/Mapping.cpp b/openmp/libomptarget/src/OpenMP/Mapping.cpp
index a5c24810e0af95..d95513d068afe8 100644
--- a/openmp/libomptarget/src/OpenMP/Mapping.cpp
+++ b/openmp/libomptarget/src/OpenMP/Mapping.cpp
@@ -252,8 +252,10 @@ TargetPointerResultTy MappingInfoTy::getTargetPointer(
MESSAGE("device mapping required by 'present' map type modifier does not "
"exist for host address " DPxMOD " (%" PRId64 " bytes)",
DPxPTR(HstPtrBegin), Size);
- } else if (PM->getRequirements() & OMP_REQ_UNIFIED_SHARED_MEMORY &&
- !HasCloseModifier) {
+ } else if ((PM->getRequirements() & OMP_REQ_UNIFIED_SHARED_MEMORY &&
+ !HasCloseModifier) ||
+ (PM->getRequirements() & OMPX_REQ_AUTO_ZERO_COPY)) {
+
// If unified shared memory is active, implicitly mapped variables that are
// not privatized use host address. Any explicitly mapped variables also use
// host address where correctness is not impeded. In all other cases maps
@@ -261,6 +263,10 @@ TargetPointerResultTy MappingInfoTy::getTargetPointer(
// In addition to the mapping rules above, the close map modifier forces the
// mapping of the variable to the device.
if (Size) {
+ INFO(OMP_INFOTYPE_MAPPING_CHANGED, Device.DeviceID,
+ "Return HstPtrBegin " DPxMOD " Size=%" PRId64 " for unified shared "
+ "memory\n",
+ DPxPTR((uintptr_t)HstPtrBegin), Size);
DP("Return HstPtrBegin " DPxMOD " Size=%" PRId64 " for unified shared "
"memory\n",
DPxPTR((uintptr_t)HstPtrBegin), Size);
@@ -415,7 +421,8 @@ TargetPointerResultTy MappingInfoTy::getTgtPtrBegin(
LR.TPR.getEntry()->dynRefCountToStr().c_str(), DynRefCountAction,
LR.TPR.getEntry()->holdRefCountToStr().c_str(), HoldRefCountAction);
LR.TPR.TargetPointer = (void *)TP;
- } else if (PM->getRequirements() & OMP_REQ_UNIFIED_SHARED_MEMORY) {
+ } else if (PM->getRequirements() & OMP_REQ_UNIFIED_SHARED_MEMORY ||
+ PM->getRequirements() & OMPX_REQ_AUTO_ZERO_COPY) {
// If the value isn't found in the mapping and unified shared memory
// is on then it means we have stumbled upon a value which we need to
// use directly from the host.
diff --git a/openmp/libomptarget/src/PluginManager.cpp b/openmp/libomptarget/src/PluginManager.cpp
index 83bf65f0f0de56..b624b8cff8ab53 100644
--- a/openmp/libomptarget/src/PluginManager.cpp
+++ b/openmp/libomptarget/src/PluginManager.cpp
@@ -144,6 +144,11 @@ void PluginAdaptorTy::initDevices(PluginManager &PM) {
int32_t NumPD = getNumberOfPluginDevices();
ExclusiveDevicesAccessor->reserve(DeviceOffset + NumPD);
+ // Auto zero-copy is a per-device property. We need to ensure
+ // that all devices are suggesting to use it.
+ bool UseAutoZeroCopy = true;
+ if (NumPD == 0)
+ UseAutoZeroCopy = false;
for (int32_t PDevI = 0, UserDevId = DeviceOffset; PDevI < NumPD; PDevI++) {
auto Device = std::make_unique<DeviceTy>(this, UserDevId, PDevI);
if (auto Err = Device->init()) {
@@ -151,12 +156,20 @@ void PluginAdaptorTy::initDevices(PluginManager &PM) {
toString(std::move(Err)).c_str());
continue;
}
+ UseAutoZeroCopy = UseAutoZeroCopy && Device->useAutoZeroCopy();
ExclusiveDevicesAccessor->push_back(std::move(Device));
++NumberOfUserDevices;
++UserDevId;
}
+ // Auto Zero-Copy can only be currently triggered when the system is an
+ // homogeneous APU architecture without attached discrete GPUs.
+ // If all devices suggest to use it, change requirment flags to trigger
+ // zero-copy behavior when mapping memory.
+ if (UseAutoZeroCopy)
+ PM.addRequirements(OMPX_REQ_AUTO_ZERO_COPY);
+
DP("Plugin adaptor " DPxMOD " has index %d, exposes %d out of %d devices!\n",
DPxPTR(LibraryHandler.get()), DeviceOffset, NumberOfUserDevices,
NumberOfPluginDevices);
diff --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp
index fa8932361a51c9..55f46e6bd6374e 100644
--- a/openmp/libomptarget/src/device.cpp
+++ b/openmp/libomptarget/src/device.cpp
@@ -339,3 +339,9 @@ void DeviceTy::dumpOffloadEntries() {
fprintf(stderr, " %11s: %s\n", Kind, It.second.getNameAsCStr());
}
}
+
+bool DeviceTy::useAutoZeroCopy() {
+ if (RTL->use_auto_zero_copy)
+ return RTL->use_auto_zero_copy(RTLDeviceID);
+ return false;
+}
diff --git a/openmp/libomptarget/test/mapping/auto_zero_copy.cpp b/openmp/libomptarget/test/mapping/auto_zero_copy.cpp
new file mode 100644
index 00000000000000..4201399d489599
--- /dev/null
+++ b/openmp/libomptarget/test/mapping/auto_zero_copy.cpp
@@ -0,0 +1,55 @@
+// clang-format off
+// RUN: %libomptarget-compilexx-generic
+// RUN: env OMPX_APU_MAPS=1 HSA_XNACK=1 LIBOMPTARGET_INFO=30 %libomptarget-run-generic 2>&1 \
+// RUN: | %fcheck-generic -check-prefix=INFO_ZERO -check-prefix=CHECK
+
+// RUN: %libomptarget-compilexx-generic
+// RUN: env HSA_XNACK=0 LIBOMPTARGET_INFO=30 %libomptarget-run-generic 2>&1 \
+// RUN: | %fcheck-generic -check-prefix=INFO_COPY -check-prefix=CHECK
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: nvptx64-nvidia-cuda
+// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+// clang-format on
+
+#include <cstdio>
+
+int main() {
+ int n = 1024;
+
+ // test various mapping types
+ int *a = new int[n];
+ int k = 3;
+ int b[n];
+
+ for (int i = 0; i < n; i++)
+ b[i] = i;
+
+ // clang-format off
+ // INFO_ZERO: Return HstPtrBegin 0x{{.*}} Size=4096 for unified shared memory
+ // INFO_ZERO: Return HstPtrBegin 0x{{.*}} Size=4096 for unified shared memory
+
+ // INFO_COPY: Creating new map entry with HstPtrBase=0x{{.*}}, HstPtrBegin=0x{{.*}}, TgtAllocBegin=0x{{.*}}, TgtPtrBegin=0x{{.*}}, Size=4096,
+ // INFO_COPY: Creating new map entry with HstPtrBase=0x{{.*}}, HstPtrBegin=0x{{.*}}, TgtAllocBegin=0x{{.*}}, TgtPtrBegin=0x{{.*}}, Size=4096,
+ // INFO_COPY: Mapping exists with HstPtrBegin=0x{{.*}}, TgtPtrBegin=0x{{.*}}, Size=4096, DynRefCount=1 (update suppressed)
+ // INFO_COPY: Mapping exists with HstPtrBegin=0x{{.*}}, TgtPtrBegin=0x{{.*}}, Size=4096, DynRefCount=1 (update suppressed)
+// clang-format on
+#pragma omp target teams distribute parallel for map(tofrom : a[ : n]) \
+ map(to : b[ : n])
+ for (int i = 0; i < n; i++)
+ a[i] = i + b[i] + k;
+
+ int err = 0;
+ for (int i = 0; i < n; i++)
+ if (a[i] != i + b[i] + k)
+ err++;
+
+ // CHECK: PASS
+ if (err == 0)
+ printf("PASS\n");
+ return err;
+}
``````````
</details>
https://github.com/llvm/llvm-project/pull/77512
More information about the Openmp-commits
mailing list