[Openmp-commits] [openmp] d6a3d6b - [openmp] Fixed Support for VA for record-replay. (#70396)
via Openmp-commits
openmp-commits at lists.llvm.org
Sun Oct 29 12:27:23 PDT 2023
Author: Konstantinos Parasyris
Date: 2023-10-29T12:27:19-07:00
New Revision: d6a3d6b96de64135768ba74b2deea515d7a6fb55
URL: https://github.com/llvm/llvm-project/commit/d6a3d6b96de64135768ba74b2deea515d7a6fb55
DIFF: https://github.com/llvm/llvm-project/commit/d6a3d6b96de64135768ba74b2deea515d7a6fb55.diff
LOG: [openmp] Fixed Support for VA for record-replay. (#70396)
The commit was discussed in Phabricator
(https://reviews.llvm.org/D157186).
Record replay currently fails on AMD because it conflicts with the heap
memory allocator introduced in #69806. The workaround is to set
`LIBOMPTARGET_HEAP_SIZE=0` during both the record and the replay runs.
Added:
Modified:
openmp/libomptarget/include/Utilities.h
openmp/libomptarget/include/omptarget.h
openmp/libomptarget/include/rtl.h
openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h
openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
openmp/libomptarget/src/device.cpp
openmp/libomptarget/src/interface.cpp
openmp/libomptarget/src/omptarget.cpp
openmp/libomptarget/src/private.h
openmp/libomptarget/tools/kernelreplay/llvm-omp-kernel-replay.cpp
Removed:
################################################################################
diff --git a/openmp/libomptarget/include/Utilities.h b/openmp/libomptarget/include/Utilities.h
index 82593e206e4d032..84d38d05911f712 100644
--- a/openmp/libomptarget/include/Utilities.h
+++ b/openmp/libomptarget/include/Utilities.h
@@ -253,6 +253,11 @@ template <typename Ty> Ty *alignPtr(Ty *Ptr, int64_t Alignment) {
return std::align(Alignment, sizeof(char), Ptr, Space);
}
+/// Round up \p V to a \p Boundary.
+template <typename Ty> inline Ty roundUp(Ty V, Ty Boundary) {
+ return (V + Boundary - 1) / Boundary * Boundary;
+}
+
} // namespace target
} // namespace omp
} // namespace llvm
diff --git a/openmp/libomptarget/include/omptarget.h b/openmp/libomptarget/include/omptarget.h
index e1f0f77849fa206..de4a1935c28632e 100644
--- a/openmp/libomptarget/include/omptarget.h
+++ b/openmp/libomptarget/include/omptarget.h
@@ -439,7 +439,7 @@ void __tgt_set_info_flag(uint32_t);
int __tgt_print_device_info(int64_t DeviceId);
int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize,
- bool IsRecord, bool SaveOutput);
+ void *VAddr, bool IsRecord, bool SaveOutput);
#ifdef __cplusplus
}
diff --git a/openmp/libomptarget/include/rtl.h b/openmp/libomptarget/include/rtl.h
index 782a46e27bf47e6..2272577684f0c6c 100644
--- a/openmp/libomptarget/include/rtl.h
+++ b/openmp/libomptarget/include/rtl.h
@@ -73,7 +73,8 @@ struct RTLInfoTy {
typedef int32_t(data_notify_mapped_ty)(int32_t, void *, int64_t);
typedef int32_t(data_notify_unmapped_ty)(int32_t, void *);
typedef int32_t(set_device_offset_ty)(int32_t);
- typedef int32_t(activate_record_replay_ty)(int32_t, uint64_t, bool, bool);
+ typedef int32_t(activate_record_replay_ty)(int32_t, uint64_t, void *, bool,
+ bool);
int32_t Idx = -1; // RTL index, index is the number of devices
// of other RTLs that were registered before,
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
index 5366fad0c862e7d..fbecb4963c4abcb 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -2579,6 +2579,16 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
DeviceMemoryPoolSize = Value;
return Plugin::success();
}
+ Error getDeviceMemorySize(uint64_t &Value) override {
+ for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) {
+ if (Pool->isGlobal()) {
+ hsa_status_t Status =
+ Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_SIZE, Value);
+ return Plugin::check(Status, "Error in getting device memory size: %s");
+ }
+ }
+ return Plugin::error("getDeviceMemorySize:: no global pool");
+ }
/// AMDGPU-specific function to get device attributes.
template <typename Ty> Error getDeviceAttr(uint32_t Kind, Ty &Value) {
diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
index e5ee3840a676886..200fa15cb9fb946 100644
--- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
@@ -49,40 +49,87 @@ struct RecordReplayTy {
void *MemoryStart;
void *MemoryPtr;
size_t MemorySize;
+ size_t TotalSize;
GenericDeviceTy *Device;
std::mutex AllocationLock;
RRStatusTy Status;
bool ReplaySaveOutput;
- uint64_t DeviceMemorySize;
-
- // Record/replay pre-allocates the largest possible device memory using the
- // default kind.
- // TODO: Expand allocation to include other kinds (device, host, shared) and
- // possibly use a MemoryManager to track (de-)allocations for
- // storing/retrieving when recording/replaying.
- Error preallocateDeviceMemory(uint64_t DeviceMemorySize) {
- // Pre-allocate memory on device. Starts with 64GB and subtracts in steps
- // of 1GB until allocation succeeds.
- const size_t MAX_MEMORY_ALLOCATION = DeviceMemorySize;
+
+ void *suggestAddress(uint64_t MaxMemoryAllocation) {
+ // Get a valid pointer address for this system
+ void *Addr =
+ Device->allocate(1024, /* HstPtr */ nullptr, TARGET_ALLOC_DEFAULT);
+ Device->free(Addr);
+ // Align Address to MaxMemoryAllocation
+ Addr = (void *)alignPtr((Addr), MaxMemoryAllocation);
+ return Addr;
+ }
+
+ Error preAllocateVAMemory(uint64_t MaxMemoryAllocation, void *VAddr) {
+ size_t ASize = MaxMemoryAllocation;
+
+ if (!VAddr && isRecording())
+ VAddr = suggestAddress(MaxMemoryAllocation);
+
+ DP("Request %ld bytes allocated at %p\n", MaxMemoryAllocation, VAddr);
+
+ if (auto Err = Device->memoryVAMap(&MemoryStart, VAddr, &ASize))
+ return Err;
+
+ if (isReplaying() && VAddr != MemoryStart) {
+ return Plugin::error("Record-Replay cannot assign the"
+ "requested recorded address (%p, %p)",
+ VAddr, MemoryStart);
+ }
+
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device->getDeviceId(),
+ "Allocated %" PRIu64 " bytes at %p for replay.\n", ASize, MemoryStart);
+
+ MemoryPtr = MemoryStart;
+ MemorySize = 0;
+ TotalSize = ASize;
+ return Plugin::success();
+ }
+
+ Error preAllocateHeuristic(uint64_t MaxMemoryAllocation, void *VAddr) {
+ const size_t MAX_MEMORY_ALLOCATION = MaxMemoryAllocation;
constexpr size_t STEP = 1024 * 1024 * 1024ULL;
MemoryStart = nullptr;
- for (size_t Try = MAX_MEMORY_ALLOCATION; Try > 0; Try -= STEP) {
- MemoryStart =
- Device->allocate(Try, /* HstPtr */ nullptr, TARGET_ALLOC_DEFAULT);
+ for (TotalSize = MAX_MEMORY_ALLOCATION; TotalSize > 0; TotalSize -= STEP) {
+ MemoryStart = Device->allocate(TotalSize, /* HstPtr */ nullptr,
+ TARGET_ALLOC_DEFAULT);
if (MemoryStart)
break;
}
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device->getDeviceId(),
+ "Allocated %" PRIu64 " bytes at %p for replay.\n", TotalSize,
+ MemoryStart);
+
if (!MemoryStart)
return Plugin::error("Allocating record/replay memory");
+ if (VAddr && VAddr != MemoryStart)
+ return Plugin::error("Cannot allocate recorded address");
+
MemoryPtr = MemoryStart;
MemorySize = 0;
return Plugin::success();
}
+ Error preallocateDeviceMemory(uint64_t DeviceMemorySize, void *ReqVAddr) {
+ if (Device->supportVAManagement())
+ return preAllocateVAMemory(DeviceMemorySize, ReqVAddr);
+
+ uint64_t DevMemSize;
+ if (Device->getDeviceMemorySize(DevMemSize))
+ return Plugin::error("Cannot determine Device Memory Size");
+
+ return preAllocateHeuristic(DevMemSize, ReqVAddr);
+ }
+
void dumpDeviceMemory(StringRef Filename) {
ErrorOr<std::unique_ptr<WritableMemoryBuffer>> DeviceMemoryMB =
WritableMemoryBuffer::getNewUninitMemBuffer(MemorySize);
@@ -114,8 +161,7 @@ struct RecordReplayTy {
bool isSaveOutputEnabled() const { return ReplaySaveOutput; }
RecordReplayTy()
- : Status(RRStatusTy::RRDeactivated), ReplaySaveOutput(false),
- DeviceMemorySize(-1) {}
+ : Status(RRStatusTy::RRDeactivated), ReplaySaveOutput(false) {}
void saveImage(const char *Name, const DeviceImageTy &Image) {
SmallString<128> ImageName = {Name, ".image"};
@@ -197,6 +243,7 @@ struct RecordReplayTy {
JsonKernelInfo["LoopTripCount"] = LoopTripCount;
JsonKernelInfo["DeviceMemorySize"] = MemorySize;
JsonKernelInfo["DeviceId"] = Device->getDeviceId();
+ JsonKernelInfo["BumpAllocVAStart"] = (intptr_t)MemoryStart;
json::Array JsonArgPtrs;
for (int I = 0; I < NumArgs; ++I)
@@ -244,27 +291,33 @@ struct RecordReplayTy {
return Alloc;
}
- Error init(GenericDeviceTy *Device, uint64_t MemSize, RRStatusTy Status,
- bool SaveOutput) {
+ Error init(GenericDeviceTy *Device, uint64_t MemSize, void *VAddr,
+ RRStatusTy Status, bool SaveOutput) {
this->Device = Device;
this->Status = Status;
- this->DeviceMemorySize = MemSize;
this->ReplaySaveOutput = SaveOutput;
- if (auto Err = preallocateDeviceMemory(MemSize))
+ if (auto Err = preallocateDeviceMemory(MemSize, VAddr))
return Err;
INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device->getDeviceId(),
"Record Replay Initialized (%p)"
" as starting address, %lu Memory Size"
" and set on status %s\n",
- MemoryStart, MemSize,
+ MemoryStart, TotalSize,
Status == RRStatusTy::RRRecording ? "Recording" : "Replaying");
return Plugin::success();
}
- void deinit() { Device->free(MemoryStart); }
+ void deinit() {
+ if (Device->supportVAManagement()) {
+ if (auto Err = Device->memoryVAUnMap(MemoryStart, TotalSize))
+ report_fatal_error("Error on releasing virtual memory space");
+ } else {
+ Device->free(MemoryStart);
+ }
+ }
} RecordReplay;
@@ -1184,6 +1237,19 @@ Error GenericDeviceTy::queryAsync(__tgt_async_info *AsyncInfo) {
return queryAsyncImpl(*AsyncInfo);
}
+Error GenericDeviceTy::memoryVAMap(void **Addr, void *VAddr, size_t *RSize) {
+ return Plugin::error("Device does not suppport VA Management");
+}
+
+Error GenericDeviceTy::memoryVAUnMap(void *VAddr, size_t Size) {
+ return Plugin::error("Device does not suppport VA Management");
+}
+
+Error GenericDeviceTy::getDeviceMemorySize(uint64_t &DSize) {
+ return Plugin::error(
+ "Mising getDeviceMemorySize impelmentation (required by RR-heuristic");
+}
+
Expected<void *> GenericDeviceTy::dataAlloc(int64_t Size, void *HostPtr,
TargetAllocTy Kind) {
void *Alloc = nullptr;
@@ -1552,8 +1618,8 @@ int32_t __tgt_rtl_is_data_exchangable(int32_t SrcDeviceId,
return Plugin::get().isDataExchangable(SrcDeviceId, DstDeviceId);
}
-int32_t __tgt_rtl_initialize_record_replay(int32_t DeviceId,
- uint64_t MemorySize, bool isRecord,
+int32_t __tgt_rtl_initialize_record_replay(int32_t DeviceId, int64_t MemorySize,
+ void *VAddr, bool isRecord,
bool SaveOutput) {
GenericPluginTy &Plugin = Plugin::get();
GenericDeviceTy &Device = Plugin.getDevice(DeviceId);
@@ -1561,7 +1627,8 @@ int32_t __tgt_rtl_initialize_record_replay(int32_t DeviceId,
isRecord ? RecordReplayTy::RRStatusTy::RRRecording
: RecordReplayTy::RRStatusTy::RRReplaying;
- if (auto Err = RecordReplay.init(&Device, MemorySize, Status, SaveOutput)) {
+ if (auto Err =
+ RecordReplay.init(&Device, MemorySize, VAddr, Status, SaveOutput)) {
REPORT("WARNING RR did not intialize RR-properly with %lu bytes"
"(Error: %s)\n",
MemorySize, toString(std::move(Err)).data());
diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
index e61b28b46267757..e6cfa3d3d6c11af 100644
--- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
@@ -655,6 +655,21 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
Error queryAsync(__tgt_async_info *AsyncInfo);
virtual Error queryAsyncImpl(__tgt_async_info &AsyncInfo) = 0;
+ /// Check whether the architecture supports VA management
+ virtual bool supportVAManagement() const { return false; }
+
+ /// Get the total device memory size
+ virtual Error getDeviceMemorySize(uint64_t &DSize);
+
+ /// Allocates \p RSize bytes (rounded up to page size) and hints the driver to
+ /// map it to \p VAddr. The obtained address is stored in \p Addr. At return
+ /// \p RSize contains the actual size which can be equal or larger than the
+ /// requested size.
+ virtual Error memoryVAMap(void **Addr, void *VAddr, size_t *RSize);
+
+ /// De-allocates device memory and unmaps the virtual address \p VAddr
+ virtual Error memoryVAUnMap(void *VAddr, size_t Size);
+
/// Allocate data on the device or involving the device.
Expected<void *> dataAlloc(int64_t Size, void *HostPtr, TargetAllocTy Kind);
diff --git a/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp b/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
index 2271b3aa90ddd15..3d0de0d5b2caff6 100644
--- a/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
+++ b/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
@@ -81,6 +81,16 @@ DLWRAP(cuEventDestroy, 1)
DLWRAP_FINALIZE()
+DLWRAP(cuMemUnmap, 2)
+DLWRAP(cuMemRelease, 1)
+DLWRAP(cuMemAddressFree, 2)
+DLWRAP(cuMemGetInfo, 2)
+DLWRAP(cuMemAddressReserve, 5)
+DLWRAP(cuMemMap, 5)
+DLWRAP(cuMemCreate, 4)
+DLWRAP(cuMemSetAccess, 4)
+DLWRAP(cuMemGetAllocationGranularity, 3)
+
#ifndef DYNAMIC_CUDA_PATH
#define DYNAMIC_CUDA_PATH "libcuda.so"
#endif
diff --git a/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h b/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h
index 459236f7ecbd025..3e0307759924b21 100644
--- a/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h
+++ b/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h
@@ -26,6 +26,71 @@ typedef struct CUevent_st *CUevent;
#define CU_DEVICE_INVALID ((CUdevice)-2)
+typedef unsigned long long CUmemGenericAllocationHandle_v1;
+typedef CUmemGenericAllocationHandle_v1 CUmemGenericAllocationHandle;
+
+#define CU_DEVICE_INVALID ((CUdevice)-2)
+
+typedef enum CUmemAllocationGranularity_flags_enum {
+ CU_MEM_ALLOC_GRANULARITY_MINIMUM = 0x0,
+ CU_MEM_ALLOC_GRANULARITY_RECOMMENDED = 0x1
+} CUmemAllocationGranularity_flags;
+
+typedef enum CUmemAccess_flags_enum {
+ CU_MEM_ACCESS_FLAGS_PROT_NONE = 0x0,
+ CU_MEM_ACCESS_FLAGS_PROT_READ = 0x1,
+ CU_MEM_ACCESS_FLAGS_PROT_READWRITE = 0x3,
+ CU_MEM_ACCESS_FLAGS_PROT_MAX = 0x7FFFFFFF
+} CUmemAccess_flags;
+
+typedef enum CUmemLocationType_enum {
+ CU_MEM_LOCATION_TYPE_INVALID = 0x0,
+ CU_MEM_LOCATION_TYPE_DEVICE = 0x1,
+ CU_MEM_LOCATION_TYPE_MAX = 0x7FFFFFFF
+} CUmemLocationType;
+
+typedef struct CUmemLocation_st {
+ CUmemLocationType type;
+ int id;
+} CUmemLocation_v1;
+typedef CUmemLocation_v1 CUmemLocation;
+
+typedef struct CUmemAccessDesc_st {
+ CUmemLocation location;
+ CUmemAccess_flags flags;
+} CUmemAccessDesc_v1;
+
+typedef CUmemAccessDesc_v1 CUmemAccessDesc;
+
+typedef enum CUmemAllocationType_enum {
+ CU_MEM_ALLOCATION_TYPE_INVALID = 0x0,
+ CU_MEM_ALLOCATION_TYPE_PINNED = 0x1,
+ CU_MEM_ALLOCATION_TYPE_MAX = 0x7FFFFFFF
+} CUmemAllocationType;
+
+typedef enum CUmemAllocationHandleType_enum {
+ CU_MEM_HANDLE_TYPE_NONE = 0x0,
+ CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR = 0x1,
+ CU_MEM_HANDLE_TYPE_WIN32 = 0x2,
+ CU_MEM_HANDLE_TYPE_WIN32_KMT = 0x4,
+ CU_MEM_HANDLE_TYPE_MAX = 0x7FFFFFFF
+} CUmemAllocationHandleType;
+
+typedef struct CUmemAllocationProp_st {
+ CUmemAllocationType type;
+ CUmemAllocationHandleType requestedHandleTypes;
+ CUmemLocation location;
+
+ void *win32HandleMetaData;
+ struct {
+ unsigned char compressionType;
+ unsigned char gpuDirectRDMACapable;
+ unsigned short usage;
+ unsigned char reserved[4];
+ } allocFlags;
+} CUmemAllocationProp_v1;
+typedef CUmemAllocationProp_v1 CUmemAllocationProp;
+
typedef enum cudaError_enum {
CUDA_SUCCESS = 0,
CUDA_ERROR_INVALID_VALUE = 1,
@@ -268,4 +333,21 @@ CUresult cuStreamWaitEvent(CUstream, CUevent, unsigned int);
CUresult cuEventSynchronize(CUevent);
CUresult cuEventDestroy(CUevent);
+CUresult cuMemUnmap(CUdeviceptr ptr, size_t size);
+CUresult cuMemRelease(CUmemGenericAllocationHandle handle);
+CUresult cuMemAddressFree(CUdeviceptr ptr, size_t size);
+CUresult cuMemGetInfo(size_t *free, size_t *total);
+CUresult cuMemAddressReserve(CUdeviceptr *ptr, size_t size, size_t alignment,
+ CUdeviceptr addr, unsigned long long flags);
+CUresult cuMemMap(CUdeviceptr ptr, size_t size, size_t offset,
+ CUmemGenericAllocationHandle handle,
+ unsigned long long flags);
+CUresult cuMemCreate(CUmemGenericAllocationHandle *handle, size_t size,
+ const CUmemAllocationProp *prop, unsigned long long flags);
+CUresult cuMemSetAccess(CUdeviceptr ptr, size_t size,
+ const CUmemAccessDesc *desc, size_t count);
+CUresult cuMemGetAllocationGranularity(size_t *granularity,
+ const CUmemAllocationProp *prop,
+ CUmemAllocationGranularity_flags option);
+
#endif
diff --git a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
index d3375b5a556bd8e..be34051bc96f974 100644
--- a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
@@ -517,6 +517,116 @@ struct CUDADeviceTy : public GenericDeviceTy {
return Plugin::check(Res, "Error in cuStreamSynchronize: %s");
}
+ /// CUDA support VA management
+ bool supportVAManagement() const override { return true; }
+
+ /// Allocates \p RSize bytes (rounded up to page size) and hints the cuda
+ /// driver to map it to \p VAddr. The obtained address is stored in \p Addr.
+ /// At return \p RSize contains the actual size
+ Error memoryVAMap(void **Addr, void *VAddr, size_t *RSize) override {
+ CUdeviceptr DVAddr = reinterpret_cast<CUdeviceptr>(VAddr);
+ auto IHandle = DeviceMMaps.find(DVAddr);
+ size_t Size = *RSize;
+
+ if (Size == 0)
+ return Plugin::error("Memory Map Size must be larger than 0");
+
+ // Check if we have already mapped this address
+ if (IHandle != DeviceMMaps.end())
+ return Plugin::error("Address already memory mapped");
+
+ CUmemAllocationProp Prop = {};
+ size_t Granularity = 0;
+
+ size_t Free, Total;
+ CUresult Res = cuMemGetInfo(&Free, &Total);
+ if (auto Err = Plugin::check(Res, "Error in cuMemGetInfo: %s"))
+ return Err;
+
+ if (Size >= Free) {
+ *Addr = nullptr;
+ return Plugin::error(
+ "Canot map memory size larger than the available device memory");
+ }
+
+ // currently NVidia only supports pinned device types
+ Prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+ Prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+
+ Prop.location.id = DeviceId;
+ cuMemGetAllocationGranularity(&Granularity, &Prop,
+ CU_MEM_ALLOC_GRANULARITY_MINIMUM);
+ if (auto Err =
+ Plugin::check(Res, "Error in cuMemGetAllocationGranularity: %s"))
+ return Err;
+
+ if (Granularity == 0)
+ return Plugin::error("Wrong device Page size");
+
+ // Ceil to page size.
+ Size = roundUp(Size, Granularity);
+
+ // Create a handler of our allocation
+ CUmemGenericAllocationHandle AHandle;
+ Res = cuMemCreate(&AHandle, Size, &Prop, 0);
+ if (auto Err = Plugin::check(Res, "Error in cuMemCreate: %s"))
+ return Err;
+
+ CUdeviceptr DevPtr = 0;
+ Res = cuMemAddressReserve(&DevPtr, Size, 0, DVAddr, 0);
+ if (auto Err = Plugin::check(Res, "Error in cuMemAddressReserve: %s"))
+ return Err;
+
+ Res = cuMemMap(DevPtr, Size, 0, AHandle, 0);
+ if (auto Err = Plugin::check(Res, "Error in cuMemMap: %s"))
+ return Err;
+
+ CUmemAccessDesc ADesc = {};
+ ADesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+ ADesc.location.id = DeviceId;
+ ADesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+
+ // Sets address
+ Res = cuMemSetAccess(DevPtr, Size, &ADesc, 1);
+ if (auto Err = Plugin::check(Res, "Error in cuMemSetAccess: %s"))
+ return Err;
+
+ *Addr = reinterpret_cast<void *>(DevPtr);
+ *RSize = Size;
+ DeviceMMaps.insert({DevPtr, AHandle});
+ return Plugin::success();
+ }
+
+ /// De-allocates device memory and Unmaps the Virtual Addr
+ Error memoryVAUnMap(void *VAddr, size_t Size) override {
+ CUdeviceptr DVAddr = reinterpret_cast<CUdeviceptr>(VAddr);
+ auto IHandle = DeviceMMaps.find(DVAddr);
+ // Mapping does not exist
+ if (IHandle == DeviceMMaps.end()) {
+ return Plugin::error("Addr is not MemoryMapped");
+ }
+
+ if (IHandle == DeviceMMaps.end())
+ return Plugin::error("Addr is not MemoryMapped");
+
+ CUmemGenericAllocationHandle &AllocHandle = IHandle->second;
+
+ CUresult Res = cuMemUnmap(DVAddr, Size);
+ if (auto Err = Plugin::check(Res, "Error in cuMemUnmap: %s"))
+ return Err;
+
+ Res = cuMemRelease(AllocHandle);
+ if (auto Err = Plugin::check(Res, "Error in cuMemRelease: %s"))
+ return Err;
+
+ Res = cuMemAddressFree(DVAddr, Size);
+ if (auto Err = Plugin::check(Res, "Error in cuMemAddressFree: %s"))
+ return Err;
+
+ DeviceMMaps.erase(IHandle);
+ return Plugin::success();
+ }
+
/// Query for the completion of the pending operations on the async info.
Error queryAsyncImpl(__tgt_async_info &AsyncInfo) override {
CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo.Queue);
@@ -859,6 +969,10 @@ struct CUDADeviceTy : public GenericDeviceTy {
Error setDeviceHeapSize(uint64_t Value) override {
return setCtxLimit(CU_LIMIT_MALLOC_HEAP_SIZE, Value);
}
+ Error getDeviceMemorySize(uint64_t &Value) override {
+ CUresult Res = cuDeviceTotalMem(&Value, Device);
+ return Plugin::check(Res, "Error in getDeviceMemorySize %s");
+ }
/// CUDA-specific functions for getting and setting context limits.
Error setCtxLimit(CUlimit Kind, uint64_t Value) {
@@ -907,6 +1021,9 @@ struct CUDADeviceTy : public GenericDeviceTy {
/// The CUDA device handler.
CUdevice Device = CU_DEVICE_INVALID;
+ /// The memory mapped addresses and their handles
+ std::unordered_map<CUdeviceptr, CUmemGenericAllocationHandle> DeviceMMaps;
+
/// The compute capability of the corresponding CUDA device.
struct ComputeCapabilityTy {
uint32_t Major;
diff --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp
index 93d2157dbd4ee15..8a2fe4620b39cbe 100644
--- a/openmp/libomptarget/src/device.cpp
+++ b/openmp/libomptarget/src/device.cpp
@@ -482,7 +482,8 @@ void *DeviceTy::getTgtPtrBegin(HDTTMapAccessorTy &HDTTMap, void *HstPtrBegin,
int DeviceTy::eraseMapEntry(HDTTMapAccessorTy &HDTTMap,
HostDataToTargetTy *Entry, int64_t Size) {
assert(Entry && "Trying to delete a null entry from the HDTT map.");
- assert(Entry->getTotalRefCount() == 0 && Entry->getDataEndThreadCount() == 0 &&
+ assert(Entry->getTotalRefCount() == 0 &&
+ Entry->getDataEndThreadCount() == 0 &&
"Trying to delete entry that is in use or owned by another thread.");
INFO(OMP_INFOTYPE_MAPPING_CHANGED, DeviceID,
@@ -546,7 +547,7 @@ void DeviceTy::init() {
RTL->activate_record_replay(RTLDeviceID,
OMPX_DeviceMemorySize * 1024 * 1024 * 1024,
- true, OMPX_ReplaySaveOutput);
+ nullptr, true, OMPX_ReplaySaveOutput);
}
IsInit = true;
diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp
index 0b4a393405a4fca..e9ab7f05c7a0a76 100644
--- a/openmp/libomptarget/src/interface.cpp
+++ b/openmp/libomptarget/src/interface.cpp
@@ -346,7 +346,8 @@ EXTERN int __tgt_target_kernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
/// /param SaveOutput Store the device memory after kernel
/// execution on persistent storage
EXTERN int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize,
- bool IsRecord, bool SaveOutput) {
+ void *VAddr, bool IsRecord,
+ bool SaveOutput) {
if (!deviceIsReady(DeviceId)) {
DP("Device %" PRId64 " is not ready\n", DeviceId);
return OMP_TGT_FAIL;
@@ -354,7 +355,7 @@ EXTERN int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize,
DeviceTy &Device = *PM->Devices[DeviceId];
[[maybe_unused]] int Rc =
- target_activate_rr(Device, MemorySize, IsRecord, SaveOutput);
+ target_activate_rr(Device, MemorySize, VAddr, IsRecord, SaveOutput);
assert(Rc == OFFLOAD_SUCCESS &&
"__tgt_activate_record_replay unexpected failure!");
return OMP_TGT_SUCCESS;
diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp
index 40419e448942608..65f2a49abc714ce 100644
--- a/openmp/libomptarget/src/omptarget.cpp
+++ b/openmp/libomptarget/src/omptarget.cpp
@@ -827,14 +827,13 @@ postProcessingTargetDataEnd(DeviceTy *Device,
// remaining shadow pointer entries for this struct.
const bool HasFrom = ArgType & OMP_TGT_MAPTYPE_FROM;
if (HasFrom) {
- Entry->foreachShadowPointerInfo(
- [&](const ShadowPtrInfoTy &ShadowPtr) {
- *ShadowPtr.HstPtrAddr = ShadowPtr.HstPtrVal;
- DP("Restoring original host pointer value " DPxMOD " for host "
- "pointer " DPxMOD "\n",
- DPxPTR(ShadowPtr.HstPtrVal), DPxPTR(ShadowPtr.HstPtrAddr));
- return OFFLOAD_SUCCESS;
- });
+ Entry->foreachShadowPointerInfo([&](const ShadowPtrInfoTy &ShadowPtr) {
+ *ShadowPtr.HstPtrAddr = ShadowPtr.HstPtrVal;
+ DP("Restoring original host pointer value " DPxMOD " for host "
+ "pointer " DPxMOD "\n",
+ DPxPTR(ShadowPtr.HstPtrVal), DPxPTR(ShadowPtr.HstPtrAddr));
+ return OFFLOAD_SUCCESS;
+ });
}
// Give up the lock as we either don't need it anymore (e.g., done with
@@ -1713,9 +1712,9 @@ int target(ident_t *Loc, DeviceTy &Device, void *HostPtr,
/// Enables the record replay mechanism by pre-allocating MemorySize
/// and informing the record-replayer of whether to store the output
/// in some file.
-int target_activate_rr(DeviceTy &Device, uint64_t MemorySize, bool isRecord,
- bool SaveOutput) {
- return Device.RTL->activate_record_replay(Device.DeviceID, MemorySize,
+int target_activate_rr(DeviceTy &Device, uint64_t MemorySize, void *VAddr,
+ bool isRecord, bool SaveOutput) {
+ return Device.RTL->activate_record_replay(Device.DeviceID, MemorySize, VAddr,
isRecord, SaveOutput);
}
diff --git a/openmp/libomptarget/src/private.h b/openmp/libomptarget/src/private.h
index cf88f78696b2e61..2a06bdbd1b708c4 100644
--- a/openmp/libomptarget/src/private.h
+++ b/openmp/libomptarget/src/private.h
@@ -42,7 +42,7 @@ extern int target(ident_t *Loc, DeviceTy &Device, void *HostPtr,
KernelArgsTy &KernelArgs, AsyncInfoTy &AsyncInfo);
extern int target_activate_rr(DeviceTy &Device, uint64_t MemorySize,
- bool isRecord, bool SaveOutput);
+ void *ReqAddr, bool isRecord, bool SaveOutput);
extern int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr,
void *DeviceMemory, int64_t DeviceMemorySize,
diff --git a/openmp/libomptarget/tools/kernelreplay/llvm-omp-kernel-replay.cpp b/openmp/libomptarget/tools/kernelreplay/llvm-omp-kernel-replay.cpp
index 0de59e23634a567..93fc3e7853f8e9c 100644
--- a/openmp/libomptarget/tools/kernelreplay/llvm-omp-kernel-replay.cpp
+++ b/openmp/libomptarget/tools/kernelreplay/llvm-omp-kernel-replay.cpp
@@ -87,6 +87,9 @@ int main(int argc, char **argv) {
for (auto It : *TgtArgOffsetsArray)
TgtArgOffsets.push_back(static_cast<ptrdiff_t>(It.getAsInteger().value()));
+ void *BAllocStart = reinterpret_cast<void *>(
+ JsonKernelInfo->getAsObject()->getInteger("BumpAllocVAStart").value());
+
__tgt_offload_entry KernelEntry = {nullptr, nullptr, 0, 0, 0};
std::string KernelEntryName = KernelFunc.value().str();
KernelEntry.name = const_cast<char *>(KernelEntryName.c_str());
@@ -125,8 +128,8 @@ int main(int argc, char **argv) {
__tgt_register_lib(&Desc);
- int Rc = __tgt_activate_record_replay(DeviceId, DeviceMemorySize, false,
- VerifyOpt);
+ int Rc = __tgt_activate_record_replay(DeviceId, DeviceMemorySize, BAllocStart,
+ false, VerifyOpt);
if (Rc != OMP_TGT_SUCCESS) {
report_fatal_error("Cannot activate record replay\n");
More information about the Openmp-commits
mailing list