[Openmp-commits] [openmp] b215932 - [OpenMP][libomptarget][AMDGPU] lock/unlock (pin/unpin) mechanism in libomptarget amdgpu plugin (API and implementation)

Carlo Bertolli via Openmp-commits openmp-commits at lists.llvm.org
Fri Jan 13 10:19:09 PST 2023


Author: Carlo Bertolli
Date: 2023-01-13T12:18:49-06:00
New Revision: b215932e69912bd4ad800b695469252417a9e543

URL: https://github.com/llvm/llvm-project/commit/b215932e69912bd4ad800b695469252417a9e543
DIFF: https://github.com/llvm/llvm-project/commit/b215932e69912bd4ad800b695469252417a9e543.diff

LOG: [OpenMP][libomptarget][AMDGPU] lock/unlock (pin/unpin) mechanism in libomptarget amdgpu plugin (API and implementation)
The current only way to obtain pinned memory with libomptarget is to use a custom allocator llvm_omp_target_alloc_host.
This reflects well the CUDA implementation of libomptarget, but it does not correctly expose the AMDGPU runtime API,
where any system allocated page can be locked/unlocked through a call to hsa_amd_memory_lock/unlock.
This patch enables users to allocate memory through malloc (mmap, sbreak) and then pin the related memory pages
with a libomptarget special call. It is a base support in the amdgpu libomptarget plugin to enable users to prelock
their host memory pages so that the runtime doesn't need to lock them itself for asynchronous memory transfers.

Reviewed By: jdoerfert, ye-luo

Differential Revision: https://reviews.llvm.org/D139208

Added: 
    openmp/libomptarget/test/mapping/prelock.cpp

Modified: 
    openmp/libomptarget/include/omptargetplugin.h
    openmp/libomptarget/include/rtl.h
    openmp/libomptarget/plugins/amdgpu/impl/impl.cpp
    openmp/libomptarget/plugins/amdgpu/impl/impl_runtime.h
    openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
    openmp/libomptarget/src/api.cpp
    openmp/libomptarget/src/exports
    openmp/libomptarget/src/omptarget.cpp
    openmp/libomptarget/src/private.h
    openmp/libomptarget/src/rtl.cpp
    openmp/runtime/src/kmp_alloc.cpp

Removed: 
    


################################################################################
diff  --git a/openmp/libomptarget/include/omptargetplugin.h b/openmp/libomptarget/include/omptargetplugin.h
index 50fa6e9f41e91..04f2a79e089dd 100644
--- a/openmp/libomptarget/include/omptargetplugin.h
+++ b/openmp/libomptarget/include/omptargetplugin.h
@@ -202,6 +202,13 @@ int32_t __tgt_rtl_init_async_info(int32_t ID, __tgt_async_info **AsyncInfoPtr);
 int32_t __tgt_rtl_init_device_info(int32_t ID, __tgt_device_info *DeviceInfoPtr,
                                    const char **ErrStr);
 
+// lock/pin host memory
+int32_t __tgt_rtl_data_lock(int32_t ID, void *HstPtr, int64_t Size,
+                            void **LockedPtr);
+
+// unlock/unpin host memory
+int32_t __tgt_rtl_data_unlock(int32_t ID, void *HstPtr);
+
 #ifdef __cplusplus
 }
 #endif

diff  --git a/openmp/libomptarget/include/rtl.h b/openmp/libomptarget/include/rtl.h
index a7367312e88e9..4c881fe2e0965 100644
--- a/openmp/libomptarget/include/rtl.h
+++ b/openmp/libomptarget/include/rtl.h
@@ -76,6 +76,8 @@ struct RTLInfoTy {
   typedef int32_t(init_async_info_ty)(int32_t, __tgt_async_info **);
   typedef int64_t(init_device_into_ty)(int64_t, __tgt_device_info *,
                                        const char **);
+  typedef int32_t(data_lock_ty)(int32_t, void *, int64_t, void **);
+  typedef int32_t(data_unlock_ty)(int32_t, void *);
 
   int32_t Idx = -1;             // RTL index, index is the number of devices
                                 // of other RTLs that were registered before,
@@ -127,6 +129,8 @@ struct RTLInfoTy {
   init_async_info_ty *init_async_info = nullptr;
   init_device_into_ty *init_device_info = nullptr;
   release_async_info_ty *release_async_info = nullptr;
+  data_lock_ty *data_lock = nullptr;
+  data_unlock_ty *data_unlock = nullptr;
 
   // Are there images associated with this RTL.
   bool IsUsed = false;

diff  --git a/openmp/libomptarget/plugins/amdgpu/impl/impl.cpp b/openmp/libomptarget/plugins/amdgpu/impl/impl.cpp
index ced75fa6c1cef..5299e3d238149 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/impl.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/impl/impl.cpp
@@ -12,6 +12,36 @@
  * Data
  */
 
+hsa_status_t is_locked(void *ptr, void **agentBaseAddress) {
+  hsa_status_t err = HSA_STATUS_SUCCESS;
+  hsa_amd_pointer_info_t info;
+  info.size = sizeof(hsa_amd_pointer_info_t);
+  err = hsa_amd_pointer_info(ptr, &info, /*alloc=*/nullptr,
+                             /*num_agents_accessible=*/nullptr,
+                             /*accessible=*/nullptr);
+  if (err != HSA_STATUS_SUCCESS) {
+    DP("Error when getting pointer info\n");
+    return err;
+  }
+
+  if (info.type == HSA_EXT_POINTER_TYPE_LOCKED) {
+    // When user passes in a basePtr+offset we need to fix the
+    // locked pointer to include the offset: ROCr always returns
+    // the base locked address, not the shifted one.
+    if ((char *)info.hostBaseAddress <= (char *)ptr &&
+        (char *)ptr < (char *)info.hostBaseAddress + info.sizeInBytes)
+      *agentBaseAddress =
+          (void *)((uint64_t)info.agentBaseAddress + (uint64_t)ptr -
+                   (uint64_t)info.hostBaseAddress);
+    else // address is already device-agent accessible, no need to compute
+         // offset
+      *agentBaseAddress = ptr;
+  } else
+    *agentBaseAddress = nullptr;
+
+  return HSA_STATUS_SUCCESS;
+}
+
 // host pointer (either src or dest) must be locked via hsa_amd_memory_lock
 static hsa_status_t invoke_hsa_copy(hsa_signal_t signal, void *dest,
                                     hsa_agent_t agent, const void *src,
@@ -49,12 +79,21 @@ static hsa_status_t locking_async_memcpy(enum CopyDirection direction,
                                          hsa_signal_t signal, void *dest,
                                          hsa_agent_t agent, void *src,
                                          void *lockingPtr, size_t size) {
-  hsa_status_t err;
-
   void *lockedPtr = nullptr;
-  err = hsa_amd_memory_lock(lockingPtr, size, nullptr, 0, (void **)&lockedPtr);
+  hsa_status_t err = is_locked(lockingPtr, &lockedPtr);
+  bool HostPtrIsLocked = true;
   if (err != HSA_STATUS_SUCCESS)
     return err;
+  if (!lockedPtr) { // not locked
+    HostPtrIsLocked = false;
+    hsa_agent_t agents[1] = {agent};
+    err = hsa_amd_memory_lock(lockingPtr, size, agents, /*num_agent=*/1,
+                              (void **)&lockedPtr);
+    if (err != HSA_STATUS_SUCCESS)
+      return err;
+    DP("locking_async_memcpy: lockingPtr=%p lockedPtr=%p Size = %lu\n",
+       lockingPtr, lockedPtr, size);
+  }
 
   switch (direction) {
   case H2D:
@@ -65,13 +104,16 @@ static hsa_status_t locking_async_memcpy(enum CopyDirection direction,
     break;
   }
 
-  if (err != HSA_STATUS_SUCCESS) {
+  if (err != HSA_STATUS_SUCCESS && !HostPtrIsLocked) {
     // do not leak locked host pointers, but discard potential error message
+    // because the initial error was in the copy function
     hsa_amd_memory_unlock(lockingPtr);
     return err;
   }
 
-  err = hsa_amd_memory_unlock(lockingPtr);
+  // unlock only if not user locked
+  if (!HostPtrIsLocked)
+    err = hsa_amd_memory_unlock(lockingPtr);
   if (err != HSA_STATUS_SUCCESS)
     return err;
 

diff  --git a/openmp/libomptarget/plugins/amdgpu/impl/impl_runtime.h b/openmp/libomptarget/plugins/amdgpu/impl/impl_runtime.h
index c99b3e752ccec..671044089e107 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/impl_runtime.h
+++ b/openmp/libomptarget/plugins/amdgpu/impl/impl_runtime.h
@@ -12,6 +12,9 @@
 
 extern "C" {
 
+// Check if pointer ptr is already locked
+hsa_status_t is_locked(void *ptr, void **agentBaseAddress);
+
 hsa_status_t impl_module_register_from_memory_to_place(
     void *module_bytes, size_t module_size, int DeviceId,
     hsa_status_t (*on_deserialized_data)(void *data, size_t size,

diff  --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
index 033862e13173a..e8bce775e9581 100644
--- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
@@ -1816,6 +1816,35 @@ bool imageContainsSymbol(void *Data, size_t Size, const char *Sym) {
   return (Rc == 0) && (SI.Addr != nullptr);
 }
 
+hsa_status_t lock_memory(void *HostPtr, size_t Size, hsa_agent_t Agent,
+                         void **LockedHostPtr) {
+  hsa_status_t err = is_locked(HostPtr, LockedHostPtr);
+  if (err != HSA_STATUS_SUCCESS)
+    return err;
+
+  // HostPtr is already locked, just return it
+  if (*LockedHostPtr)
+    return HSA_STATUS_SUCCESS;
+
+  hsa_agent_t Agents[1] = {Agent};
+  return hsa_amd_memory_lock(HostPtr, Size, Agents, /*num_agent=*/1,
+                             LockedHostPtr);
+}
+
+hsa_status_t unlock_memory(void *HostPtr) {
+  void *LockedHostPtr = nullptr;
+  hsa_status_t err = is_locked(HostPtr, &LockedHostPtr);
+  if (err != HSA_STATUS_SUCCESS)
+    return err;
+
+  // if LockedHostPtr is nullptr, then HostPtr was not locked
+  if (!LockedHostPtr)
+    return HSA_STATUS_SUCCESS;
+
+  err = hsa_amd_memory_unlock(HostPtr);
+  return err;
+}
+
 } // namespace
 
 namespace core {
@@ -2589,4 +2618,32 @@ void __tgt_rtl_print_device_info(int32_t DeviceId) {
   DeviceInfo().printDeviceInfo(DeviceId, DeviceInfo().HSAAgents[DeviceId]);
 }
 
+int32_t __tgt_rtl_data_lock(int32_t DeviceId, void *HostPtr, int64_t Size,
+                            void **LockedHostPtr) {
+  assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
+
+  hsa_agent_t Agent = DeviceInfo().HSAAgents[DeviceId];
+  hsa_status_t err = lock_memory(HostPtr, Size, Agent, LockedHostPtr);
+  if (err != HSA_STATUS_SUCCESS) {
+    DP("Error in tgt_rtl_data_lock\n");
+    return OFFLOAD_FAIL;
+  }
+  DP("Tgt lock host data %ld bytes, (HostPtr:%016llx).\n", Size,
+     (long long unsigned)(Elf64_Addr)*LockedHostPtr);
+  return OFFLOAD_SUCCESS;
+}
+
+int32_t __tgt_rtl_data_unlock(int DeviceId, void *HostPtr) {
+  assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
+  hsa_status_t err = unlock_memory(HostPtr);
+  if (err != HSA_STATUS_SUCCESS) {
+    DP("Error in tgt_rtl_data_unlock\n");
+    return OFFLOAD_FAIL;
+  }
+
+  DP("Tgt unlock data (tgt:%016llx).\n",
+     (long long unsigned)(Elf64_Addr)HostPtr);
+  return OFFLOAD_SUCCESS;
+}
+
 } // extern "C"

diff  --git a/openmp/libomptarget/src/api.cpp b/openmp/libomptarget/src/api.cpp
index f96a2be2146e8..47c9d5e2f1eff 100644
--- a/openmp/libomptarget/src/api.cpp
+++ b/openmp/libomptarget/src/api.cpp
@@ -82,6 +82,15 @@ EXTERN void llvm_omp_target_free_shared(void *Ptre, int DeviceNum) {
 EXTERN void *llvm_omp_target_dynamic_shared_alloc() { return nullptr; }
 EXTERN void *llvm_omp_get_dynamic_shared() { return nullptr; }
 
+EXTERN [[nodiscard]] void *llvm_omp_target_lock_mem(void *Ptr, size_t Size,
+                                                    int DeviceNum) {
+  return targetLockExplicit(Ptr, Size, DeviceNum, __func__);
+}
+
+EXTERN void llvm_omp_target_unlock_mem(void *Ptr, int DeviceNum) {
+  targetUnlockExplicit(Ptr, DeviceNum, __func__);
+}
+
 EXTERN int omp_target_is_present(const void *Ptr, int DeviceNum) {
   TIMESCOPE();
   DP("Call to omp_target_is_present for device %d and address " DPxMOD "\n",

diff  --git a/openmp/libomptarget/src/exports b/openmp/libomptarget/src/exports
index 94a3ccabf5804..5fa013dcbb839 100644
--- a/openmp/libomptarget/src/exports
+++ b/openmp/libomptarget/src/exports
@@ -49,6 +49,8 @@ VERS1.0 {
     llvm_omp_target_free_shared;
     llvm_omp_target_free_device;
     llvm_omp_target_dynamic_shared_alloc;
+    llvm_omp_target_lock_mem;
+    llvm_omp_target_unlock_mem;
     __tgt_set_info_flag;
     __tgt_print_device_info;
     omp_get_interop_ptr;

diff  --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp
index f9c5b95266211..3476e2d4a2e8f 100644
--- a/openmp/libomptarget/src/omptarget.cpp
+++ b/openmp/libomptarget/src/omptarget.cpp
@@ -425,6 +425,80 @@ void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind,
   DP("omp_target_free deallocated device ptr\n");
 }
 
+void *targetLockExplicit(void *HostPtr, size_t Size, int DeviceNum,
+                         const char *Name) {
+  TIMESCOPE();
+  DP("Call to %s for device %d locking %zu bytes\n", Name, DeviceNum, Size);
+
+  if (Size <= 0) {
+    DP("Call to %s with non-positive length\n", Name);
+    return NULL;
+  }
+
+  void *rc = NULL;
+
+  if (!deviceIsReady(DeviceNum)) {
+    DP("%s returns NULL ptr\n", Name);
+    return NULL;
+  }
+
+  DeviceTy *DevicePtr = nullptr;
+  {
+    std::lock_guard<decltype(PM->RTLsMtx)> LG(PM->RTLsMtx);
+
+    if (!PM->Devices[DeviceNum]) {
+      DP("%s returns, device %d not available\n", Name, DeviceNum);
+      return nullptr;
+    }
+
+    DevicePtr = PM->Devices[DeviceNum].get();
+  }
+
+  int32_t err = 0;
+  if (DevicePtr->RTL->data_lock) {
+    err = DevicePtr->RTL->data_lock(DeviceNum, HostPtr, Size, &rc);
+    if (err) {
+      DP("Could not lock ptr %p\n", HostPtr);
+      return nullptr;
+    }
+  }
+  DP("%s returns device ptr " DPxMOD "\n", Name, DPxPTR(rc));
+  return rc;
+}
+
+void targetUnlockExplicit(void *HostPtr, int DeviceNum, const char *Name) {
+  TIMESCOPE();
+  DP("Call to %s for device %d unlocking\n", Name, DeviceNum);
+
+  DeviceTy *DevicePtr = nullptr;
+  {
+    std::lock_guard<decltype(PM->RTLsMtx)> LG(PM->RTLsMtx);
+
+    // Don't check deviceIsReady as it can initialize the device if needed.
+    // Just check if DeviceNum exists as targetUnlockExplicit can be called
+    // during process exit/free (and it may have been already destroyed) and
+    // targetAllocExplicit will have already checked deviceIsReady anyway.
+    size_t DevicesSize = PM->Devices.size();
+
+    if (DevicesSize <= (size_t)DeviceNum) {
+      DP("Device ID  %d does not have a matching RTL\n", DeviceNum);
+      return;
+    }
+
+    if (!PM->Devices[DeviceNum]) {
+      DP("%s returns, device %d not available\n", Name, DeviceNum);
+      return;
+    }
+
+    DevicePtr = PM->Devices[DeviceNum].get();
+  } // unlock RTLsMtx
+
+  if (DevicePtr->RTL->data_unlock)
+    DevicePtr->RTL->data_unlock(DeviceNum, HostPtr);
+
+  DP("%s returns\n", Name);
+}
+
 /// Call the user-defined mapper function followed by the appropriate
 // targetData* function (targetData{Begin,End,Update}).
 int targetDataMapper(ident_t *Loc, DeviceTy &Device, void *ArgBase, void *Arg,

diff  --git a/openmp/libomptarget/src/private.h b/openmp/libomptarget/src/private.h
index 488053beb9b62..6fc47f8d137f8 100644
--- a/openmp/libomptarget/src/private.h
+++ b/openmp/libomptarget/src/private.h
@@ -51,6 +51,10 @@ extern void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind,
                                  const char *Name);
 extern void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind,
                                const char *Name);
+extern void *targetLockExplicit(void *HostPtr, size_t Size, int DeviceNum,
+                                const char *Name);
+extern void targetUnlockExplicit(void *HostPtr, int DeviceNum,
+                                 const char *Name);
 
 // This structure stores information of a mapped memory region.
 struct MapComponentInfoTy {

diff  --git a/openmp/libomptarget/src/rtl.cpp b/openmp/libomptarget/src/rtl.cpp
index 4c7f5985f080a..6f84aa238976c 100644
--- a/openmp/libomptarget/src/rtl.cpp
+++ b/openmp/libomptarget/src/rtl.cpp
@@ -246,6 +246,10 @@ bool RTLsTy::attemptLoadRTL(const std::string &RTLName, RTLInfoTy &RTL) {
       DynLibrary->getAddressOfSymbol("__tgt_rtl_init_async_info");
   *((void **)&RTL.init_device_info) =
       DynLibrary->getAddressOfSymbol("__tgt_rtl_init_device_info");
+  *((void **)&RTL.data_lock) =
+      DynLibrary->getAddressOfSymbol("__tgt_rtl_data_lock");
+  *((void **)&RTL.data_unlock) =
+      DynLibrary->getAddressOfSymbol("__tgt_rtl_data_unlock");
 
   RTL.LibraryHandler = std::move(DynLibrary);
 

diff  --git a/openmp/libomptarget/test/mapping/prelock.cpp b/openmp/libomptarget/test/mapping/prelock.cpp
new file mode 100644
index 0000000000000..7d88e5864aa08
--- /dev/null
+++ b/openmp/libomptarget/test/mapping/prelock.cpp
@@ -0,0 +1,67 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: nvptx64-nvidia-cuda
+// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+#include <cstdio>
+
+#include <omp.h>
+
+extern "C" {
+void *llvm_omp_target_lock_mem(void *ptr, size_t size, int device_num);
+void llvm_omp_target_unlock_mem(void *ptr, int device_num);
+}
+
+int main() {
+  int n = 100;
+  int *unlocked = new int[n];
+
+  for (int i = 0; i < n; i++)
+    unlocked[i] = i;
+
+  int *locked = (int *)llvm_omp_target_lock_mem(unlocked, n * sizeof(int),
+                                                omp_get_default_device());
+  if (!locked)
+    return 0;
+
+#pragma omp target teams distribute parallel for map(tofrom : unlocked[ : n])
+  for (int i = 0; i < n; i++)
+    unlocked[i] += 1;
+
+#pragma omp target teams distribute parallel for map(tofrom : unlocked[10 : 10])
+  for (int i = 10; i < 20; i++)
+    unlocked[i] += 1;
+
+#pragma omp target teams distribute parallel for map(tofrom : locked[ : n])
+  for (int i = 0; i < n; i++)
+    locked[i] += 1;
+
+#pragma omp target teams distribute parallel for map(tofrom : locked[10 : 10])
+  for (int i = 10; i < 20; i++)
+    locked[i] += 1;
+
+  llvm_omp_target_unlock_mem(unlocked, omp_get_default_device());
+
+  int err = 0;
+  for (int i = 0; i < n; i++) {
+    if (i < 10 || i > 19) {
+      if (unlocked[i] != i + 2) {
+        printf("Err at %d, got %d, expected %d\n", i, unlocked[i], i + 1);
+        err++;
+      }
+    } else if (unlocked[i] != i + 4) {
+      printf("Err at %d, got %d, expected %d\n", i, unlocked[i], i + 2);
+      err++;
+    }
+  }
+
+  // CHECK: PASS
+  if (err == 0)
+    printf("PASS\n");
+
+  return err;
+}

diff  --git a/openmp/runtime/src/kmp_alloc.cpp b/openmp/runtime/src/kmp_alloc.cpp
index ea2527b77b664..c602dc0682715 100644
--- a/openmp/runtime/src/kmp_alloc.cpp
+++ b/openmp/runtime/src/kmp_alloc.cpp
@@ -1371,6 +1371,9 @@ void __kmp_init_target_mem() {
       kmp_target_alloc_host && kmp_target_alloc_shared &&
       kmp_target_alloc_device && kmp_target_free_host &&
       kmp_target_free_shared && kmp_target_free_device;
+  // lock/pin and unlock/unpin target calls
+  *(void **)(&kmp_target_lock_mem) = KMP_DLSYM("llvm_omp_target_lock_mem");
+  *(void **)(&kmp_target_unlock_mem) = KMP_DLSYM("llvm_omp_target_unlock_mem");
 }
 
 omp_allocator_handle_t __kmpc_init_allocator(int gtid, omp_memspace_handle_t ms,


        


More information about the Openmp-commits mailing list