[Openmp-commits] [openmp] aa616ef - [libomptarget][AMDGPU][NFC] Split atmi_memcpy for h2d and d2h
Pushpinder Singh via Openmp-commits
openmp-commits at lists.llvm.org
Tue Oct 20 03:29:48 PDT 2020
Author: Pushpinder Singh
Date: 2020-10-20T06:29:32-04:00
New Revision: aa616efbb34e6321c0f24f61e017efdcf398ba04
URL: https://github.com/llvm/llvm-project/commit/aa616efbb34e6321c0f24f61e017efdcf398ba04
DIFF: https://github.com/llvm/llvm-project/commit/aa616efbb34e6321c0f24f61e017efdcf398ba04.diff
LOG: [libomptarget][AMDGPU][NFC] Split atmi_memcpy for h2d and d2h
The calls to atmi_memcpy presently determine the direction of copy (host to
device or device to host) by storing pointers in a map during malloc and
looking up the pointers during memcpy. As each call site already knows the
direction, this stash+lookup can be eliminated.
This NFC will be followed by a functional one that deletes those map lookups.
Reviewed By: JonChesterfield
Differential Revision: https://reviews.llvm.org/D89776
Change-Id: I1d9089bc1e56b3a9a30e334735fa07dee1f84990
Added:
Modified:
openmp/libomptarget/plugins/amdgpu/impl/atmi.cpp
openmp/libomptarget/plugins/amdgpu/impl/atmi_runtime.h
openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
Removed:
################################################################################
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/atmi.cpp b/openmp/libomptarget/plugins/amdgpu/impl/atmi.cpp
index 6a6bc8c35c87..0586cd358c25 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/atmi.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/impl/atmi.cpp
@@ -35,9 +35,28 @@ atmi_status_t atmi_module_register_from_memory_to_place(
*/
atmi_status_t atmi_memcpy(hsa_signal_t sig, void *dest, const void *src,
size_t size) {
+ hsa_status_t rc = hsa_memory_copy(dest, src, size);
+
+ // hsa_memory_copy sometimes fails in situations where
+ // allocate + copy succeeds. Looks like it might be related to
+ // locking part of a read only segment. Fall back for now.
+ if (rc == HSA_STATUS_SUCCESS) {
+ return ATMI_STATUS_SUCCESS;
+ }
+
return core::Runtime::Memcpy(sig, dest, src, size);
}
+atmi_status_t atmi_memcpy_h2d(hsa_signal_t sig, void *device_dest,
+ const void *host_src, size_t size) {
+ return atmi_memcpy(sig, device_dest, host_src, size);
+}
+
+atmi_status_t atmi_memcpy_d2h(hsa_signal_t sig, void *host_dest,
+ const void *device_src, size_t size) {
+ return atmi_memcpy(sig, host_dest, device_src, size);
+}
+
atmi_status_t atmi_free(void *ptr) { return core::Runtime::Memfree(ptr); }
atmi_status_t atmi_malloc(void **ptr, size_t size, atmi_mem_place_t place) {
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/atmi_runtime.h b/openmp/libomptarget/plugins/amdgpu/impl/atmi_runtime.h
index 8ee78dfca3e4..a935b6ad4b75 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/atmi_runtime.h
+++ b/openmp/libomptarget/plugins/amdgpu/impl/atmi_runtime.h
@@ -155,53 +155,11 @@ atmi_status_t atmi_malloc(void **ptr, size_t size, atmi_mem_place_t place);
*/
atmi_status_t atmi_free(void *ptr);
-/**
- * @brief Syncrhonously copy memory from the source to destination memory
- * locations.
- *
- * @detail This function assumes that the source and destination regions are
- * non-overlapping. The runtime determines the memory place of the source and
- * the
- * destination and executes the appropriate optimized data movement methodology.
- *
- * @param[in] dest The destination pointer previously allocated by a system
- * allocator or @p atmi_malloc.
- *
- * @param[in] src The source pointer previously allocated by a system
- * allocator or @p atmi_malloc.
- *
- * @param[in] size The size of the data to be copied in bytes.
- *
- * @retval ::ATMI_STATUS_SUCCESS The function has executed successfully.
- *
- * @retval ::ATMI_STATUS_ERROR The function encountered errors.
- *
- * @retval ::ATMI_STATUS_UNKNOWN The function encountered errors.
- *
- */
-atmi_status_t atmi_memcpy(hsa_signal_t sig, void *dest, const void *src,
- size_t size);
+atmi_status_t atmi_memcpy_h2d(hsa_signal_t sig, void *device_dest,
+ const void *host_src, size_t size);
-static inline atmi_status_t atmi_memcpy_no_signal(void *dest, const void *src,
- size_t size) {
- hsa_signal_t sig;
- hsa_status_t err = hsa_signal_create(0, 0, NULL, &sig);
- if (err != HSA_STATUS_SUCCESS) {
- return ATMI_STATUS_ERROR;
- }
-
- atmi_status_t r = atmi_memcpy(sig, dest, src, size);
- hsa_status_t rc = hsa_signal_destroy(sig);
-
- if (r != ATMI_STATUS_SUCCESS) {
- return r;
- }
- if (rc != HSA_STATUS_SUCCESS) {
- return ATMI_STATUS_ERROR;
- }
-
- return ATMI_STATUS_SUCCESS;
-}
+atmi_status_t atmi_memcpy_d2h(hsa_signal_t sig, void *host_dest,
+ const void *device_src, size_t size);
/** @} */
diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
index e0509a5f2b32..8fd7b7e34155 100644
--- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
@@ -335,17 +335,29 @@ class RTLDeviceInfoTy {
static const int Default_WG_Size =
llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Default_WG_Size];
- atmi_status_t freesignalpool_memcpy(void *dest, const void *src,
- size_t size) {
+ using MemcpyFunc = atmi_status_t(hsa_signal_t, void *, const void *,
+ size_t size);
+ atmi_status_t freesignalpool_memcpy(void *dest, const void *src, size_t size,
+ MemcpyFunc Func) {
hsa_signal_t s = FreeSignalPool.pop();
if (s.handle == 0) {
return ATMI_STATUS_ERROR;
}
- atmi_status_t r = atmi_memcpy(s, dest, src, size);
+ atmi_status_t r = Func(s, dest, src, size);
FreeSignalPool.push(s);
return r;
}
+ atmi_status_t freesignalpool_memcpy_d2h(void *dest, const void *src,
+ size_t size) {
+ return freesignalpool_memcpy(dest, src, size, atmi_memcpy_d2h);
+ }
+
+ atmi_status_t freesignalpool_memcpy_h2d(void *dest, const void *src,
+ size_t size) {
+ return freesignalpool_memcpy(dest, src, size, atmi_memcpy_h2d);
+ }
+
// Record entry point associated with device
void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) {
assert(device_id < (int32_t)FuncGblEntries.size() &&
@@ -538,7 +550,7 @@ int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size,
(long long unsigned)(Elf64_Addr)TgtPtr,
(long long unsigned)(Elf64_Addr)HstPtr);
- err = DeviceInfo.freesignalpool_memcpy(HstPtr, TgtPtr, (size_t)Size);
+ err = DeviceInfo.freesignalpool_memcpy_d2h(HstPtr, TgtPtr, (size_t)Size);
if (err != ATMI_STATUS_SUCCESS) {
DP("Error when copying data from device to host. Pointers: "
@@ -564,7 +576,7 @@ int32_t dataSubmit(int32_t DeviceId, void *TgtPtr, void *HstPtr, int64_t Size,
DP("Submit data %ld bytes, (hst:%016llx) -> (tgt:%016llx).\n", Size,
(long long unsigned)(Elf64_Addr)HstPtr,
(long long unsigned)(Elf64_Addr)TgtPtr);
- err = DeviceInfo.freesignalpool_memcpy(TgtPtr, HstPtr, (size_t)Size);
+ err = DeviceInfo.freesignalpool_memcpy_h2d(TgtPtr, HstPtr, (size_t)Size);
if (err != ATMI_STATUS_SUCCESS) {
DP("Error when copying data from host to device. Pointers: "
"host = 0x%016lx, device = 0x%016lx, size = %lld\n",
@@ -1021,7 +1033,7 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id,
}
// write ptr to device memory so it can be used by later kernels
- err = DeviceInfo.freesignalpool_memcpy(state_ptr, &ptr, sizeof(void *));
+ err = DeviceInfo.freesignalpool_memcpy_h2d(state_ptr, &ptr, sizeof(void *));
if (err != ATMI_STATUS_SUCCESS) {
fprintf(stderr, "memcpy install of state_ptr failed\n");
return NULL;
@@ -1090,7 +1102,8 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id,
// If unified memory is present any target link variables
// can access host addresses directly. There is no longer a
// need for device copies.
- err = DeviceInfo.freesignalpool_memcpy(varptr, e->addr, sizeof(void *));
+ err = DeviceInfo.freesignalpool_memcpy_h2d(varptr, e->addr,
+ sizeof(void *));
if (err != ATMI_STATUS_SUCCESS)
DP("Error when copying USM\n");
DP("Copy linked variable host address (" DPxMOD ")"
@@ -1518,8 +1531,8 @@ static void *AllocateNestedParallelCallMemory(int MaxParLevel, int NumGroups,
void *TgtPtr = NULL;
atmi_status_t err =
atmi_malloc(&TgtPtr, NestedMemSize, get_gpu_mem_place(device_id));
- err =
- DeviceInfo.freesignalpool_memcpy(CallStackAddr, &TgtPtr, sizeof(void *));
+ err = DeviceInfo.freesignalpool_memcpy_h2d(CallStackAddr, &TgtPtr,
+ sizeof(void *));
if (print_kernel_trace > 2)
fprintf(stderr, "CallSck %lx TgtPtr %lx *TgtPtr %lx \n",
(long)CallStackAddr, (long)&TgtPtr, (long)TgtPtr);
More information about the Openmp-commits
mailing list