[llvm] [AMDGPU][Offload] Enable memory manager use for up to ~3GB allocation size in omp_target_alloc (PR #151882)

Sun Aug 3 11:59:21 PDT 2025

llvmbot wrote:




@llvm/pr-subscribers-backend-amdgpu

Author: None (hidekisaito)

<details>
<summary>Changes</summary>



---
Full diff: https://github.com/llvm/llvm-project/pull/151882.diff


6 Files Affected:

- (modified) offload/plugins-nextgen/amdgpu/src/rtl.cpp (+35) 
- (modified) offload/plugins-nextgen/common/include/PluginInterface.h (+3) 
- (modified) offload/plugins-nextgen/common/src/PluginInterface.cpp (+4-1) 
- (modified) offload/test/lit.cfg (+8-3) 
- (modified) offload/test/sanitizer/use_after_free_2.c (+3) 
- (added) offload/test/sanitizer/use_after_free_3.c (+35) 


``````````diff

diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index f8db9bf0ae739..94e635d94ed3b 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -2944,6 +2944,41 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     return Plugin::success();
   }
 
+  bool checkIfCoarseGrainMemoryNearOrAbove64GB() {
+    for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) {
+      if (Pool->isGlobal() && Pool->isCoarseGrained()) {
+        uint64_t Value;
+        hsa_status_t Status =
+            Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_SIZE, Value);
+        if (Status != HSA_STATUS_SUCCESS)
+          continue;
+        constexpr uint64_t Almost64Gig = 0xFF0000000;
+        if (Value >= Almost64Gig)
+          return true;
+      }
+    }
+    return false; // CoarseGrain pool w/ 64GB or more capacity not found
+  }
+
+  size_t getMemoryManagerSizeThreshold() override {
+    // Targeting high memory capacity GPUs such as
+    // MI210 or later data center GPUs.
+    if (checkIfCoarseGrainMemoryNearOrAbove64GB()) {
+      // Set GenericDeviceTy::MemoryManager's Threshold to ~3GB,
+      // if threshold is not already set by ENV var
+      // LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD.
+      // This MemoryManager is used for omp_target_alloc(), OpenMP
+      // (non-usm) map clause, etc.
+      //
+      // Ideally, this kind of pooling is best performed at
+      // a common level (e.g, user side of HSA) between OpenMP and HIP
+      // but that feature does not exist (yet).
+      constexpr size_t Almost3Gig = 3000000000u;
+      return Almost3Gig;
+    }
+    return 0;
+  }
+
   /// Envar for controlling the number of HSA queues per device. High number of
   /// queues may degrade performance.
   UInt32Envar OMPX_NumQueues;
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 8c17a2ee07047..87c3777b0eda7 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -1090,6 +1090,9 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   /// Pointer to the memory manager or nullptr if not available.
   MemoryManagerTy *MemoryManager;
 
+  /// Per device setting of MemoryManager's Threshold
+  virtual size_t getMemoryManagerSizeThreshold() { return 0 /* use default */; }
+
   /// Environment variables defined by the OpenMP standard.
   Int32Envar OMP_TeamLimit;
   Int32Envar OMP_NumTeams;
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 94a050b559efe..46151cc9abce6 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -815,8 +815,11 @@ Error GenericDeviceTy::init(GenericPluginTy &Plugin) {
 
   // Enable the memory manager if required.
   auto [ThresholdMM, EnableMM] = MemoryManagerTy::getSizeThresholdFromEnv();
-  if (EnableMM)
+  if (EnableMM) {
+    if (ThresholdMM == 0)
+      ThresholdMM = getMemoryManagerSizeThreshold();
     MemoryManager = new MemoryManagerTy(*this, ThresholdMM);
+  }
 
   return Plugin::success();
 }
diff --git a/offload/test/lit.cfg b/offload/test/lit.cfg
index 800a63bc0ee32..f3e8e9a66685e 100644
--- a/offload/test/lit.cfg
+++ b/offload/test/lit.cfg
@@ -121,6 +121,7 @@ if config.libomptarget_test_pgo:
 # For all other targets, we currently assume it is.
 supports_unified_shared_memory = True
 supports_apu = False
+supports_large_allocation_memory_pool = False
 if config.libomptarget_current_target.startswith('nvptx'):
   try:
     cuda_arch = int(config.cuda_test_arch[:3])
@@ -132,9 +133,11 @@ if config.libomptarget_current_target.startswith('nvptx'):
 elif config.libomptarget_current_target.startswith('amdgcn'):
     # amdgpu_test_arch contains a list of AMD GPUs in the system
     # only check the first one assuming that we will run the test on it.
-    if not (config.amdgpu_test_arch.startswith("gfx90a") or
-            config.amdgpu_test_arch.startswith("gfx942") or
-            config.amdgpu_test_arch.startswith("gfx950")):
+    if (config.amdgpu_test_arch.startswith("gfx90a") or
+        config.amdgpu_test_arch.startswith("gfx942") or
+        config.amdgpu_test_arch.startswith("gfx950")):
+       supports_large_allocation_memory_pool = True
+    else:
        supports_unified_shared_memory = False
     # check if AMD architecture is an APU:
     if ((config.amdgpu_test_arch.startswith("gfx942") and
@@ -144,6 +147,8 @@ if supports_unified_shared_memory:
    config.available_features.add('unified_shared_memory')
 if supports_apu:
    config.available_features.add('apu')
+if supports_large_allocation_memory_pool:
+   config.available_features.add('large_allocation_memory_pool')
 
 # Setup environment to find dynamic library at runtime
 if config.operating_system == 'Windows':
diff --git a/offload/test/sanitizer/use_after_free_2.c b/offload/test/sanitizer/use_after_free_2.c
index 02aa453d0a975..1c1e09744a750 100644
--- a/offload/test/sanitizer/use_after_free_2.c
+++ b/offload/test/sanitizer/use_after_free_2.c
@@ -10,6 +10,9 @@
 // UNSUPPORTED: s390x-ibm-linux-gnu
 // UNSUPPORTED: s390x-ibm-linux-gnu-LTO
 
+// If offload memory pooling is enabled for a large allocation, reuse error is
+// not detected. UNSUPPORTED: large_allocation_memory_pool
+
 #include <omp.h>
 
 int main() {
diff --git a/offload/test/sanitizer/use_after_free_3.c b/offload/test/sanitizer/use_after_free_3.c
new file mode 100644
index 0000000000000..fd77cff0d5c81
--- /dev/null
+++ b/offload/test/sanitizer/use_after_free_3.c
@@ -0,0 +1,35 @@
+// clang-format off
+// RUN: %libomptarget-compileopt-generic
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD=1024 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
+// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK-PASS
+// clang-format on
+
+// If offload memory pooling is enabled for a large allocation, reuse error is
+// not detected. Run the test w/ and w/o ENV var override on memory pooling
+// threshold. REQUIRES: large_allocation_memory_pool
+
+#include <omp.h>
+#include <stdio.h>
+
+int main() {
+  int N = (1 << 30);
+  char *A = (char *)malloc(N);
+  char *P;
+#pragma omp target map(A[:N]) map(from : P)
+  {
+    P = &A[N / 2];
+    *P = 3;
+  }
+  // clang-format off
+// CHECK: OFFLOAD ERROR: memory access fault by GPU {{.*}} (agent 0x{{.*}}) at virtual address [[PTR:0x[0-9a-z]*]]. Reasons: {{.*}}
+// CHECK: Device pointer [[PTR]] points into prior host-issued allocation:
+// CHECK: Last deallocation:
+// CHECK: Last allocation of size 1073741824
+// clang-format on
+#pragma omp target
+  { *P = 5; }
+
+  // CHECK-PASS: PASS
+  printf("PASS\n");
+  return 0;
+}

``````````

</details>


https://github.com/llvm/llvm-project/pull/151882