[llvm] [Offload] Add `olMemcpyRect` with amdgpu implementation (PR #160321)
Ross Brunton via llvm-commits
llvm-commits at lists.llvm.org
Wed Sep 24 02:54:36 PDT 2025
https://github.com/RossBrunton updated https://github.com/llvm/llvm-project/pull/160321
>From 9a6d6258830a8fe5470f24dc56f7e61be147ff0f Mon Sep 17 00:00:00 2001
From: Ross Brunton <bruntonross at protonmail.com>
Date: Tue, 23 Sep 2025 16:09:15 +0100
Subject: [PATCH] [Offload] Add `olMemcpyRect` with amdgpu implementation
This is a memcpy function that copies a 2D or 3D range rather than a
linear byte count. This is an early version, and has several
limitations (some of which are temporary, some are permanent):
* Only amdgpu is supported (CUDA returns `UNIMPLEMENTED`).
* The queue is required.
* The pitch, slice and base address must be aligned to 4 bytes.
* All buffers must have been allocated through `olMemAlloc` (i.e., for
the amd runtime they must be "pinned").
* Host-to-host copies are not supported.
---
offload/include/Shared/APITypes.h | 9 +
offload/liboffload/API/Memory.td | 55 ++-
offload/liboffload/src/OffloadImpl.cpp | 30 ++
.../amdgpu/dynamic_hsa/hsa.cpp | 1 +
.../amdgpu/dynamic_hsa/hsa_ext_amd.h | 20 +
offload/plugins-nextgen/amdgpu/src/rtl.cpp | 132 +++++++
.../common/include/PluginInterface.h | 22 ++
.../common/src/PluginInterface.cpp | 35 ++
offload/plugins-nextgen/cuda/src/rtl.cpp | 21 +
offload/plugins-nextgen/host/src/rtl.cpp | 21 +
offload/unittests/OffloadAPI/CMakeLists.txt | 3 +-
.../OffloadAPI/memory/olMemcpyRect.cpp | 358 ++++++++++++++++++
12 files changed, 705 insertions(+), 2 deletions(-)
create mode 100644 offload/unittests/OffloadAPI/memory/olMemcpyRect.cpp
diff --git a/offload/include/Shared/APITypes.h b/offload/include/Shared/APITypes.h
index 8c150b6bfc2d4..e2387900740a5 100644
--- a/offload/include/Shared/APITypes.h
+++ b/offload/include/Shared/APITypes.h
@@ -126,6 +126,15 @@ struct KernelLaunchParamsTy {
/// Ptrs to the Data entries. Only strictly required for the host plugin.
void **Ptrs = nullptr;
};
+
+/// Rectangular range for rect memcpies. Should be the same layout as
+/// liboffload's `ol_memcpy_rect_t`.
+struct MemcpyRectTy {
+ void *Base;
+ uint32_t Offset[3];
+ size_t Pitch;
+ size_t Slice;
+};
}
#endif // OMPTARGET_SHARED_API_TYPES_H
diff --git a/offload/liboffload/API/Memory.td b/offload/liboffload/API/Memory.td
index debda165d2b23..4bdaa1aa73a26 100644
--- a/offload/liboffload/API/Memory.td
+++ b/offload/liboffload/API/Memory.td
@@ -22,7 +22,8 @@ def ol_alloc_type_t : Enum {
def olMemAlloc : Function {
let desc = "Creates a memory allocation on the specified device.";
let details = [
- "All allocations through olMemAlloc regardless of source share a single virtual address range. There is no risk of multiple devices returning equal pointers to different memory."
+ "All allocations through olMemAlloc regardless of source share a single virtual address range. There is no risk of multiple devices returning equal pointers to different memory.",
+ "The returned memory allocation will be aligned at least to a 4 byte boundary.",
];
let params = [
Param<"ol_device_handle_t", "Device", "handle of the device to allocate on", PARAM_IN>,
@@ -63,6 +64,58 @@ def olMemcpy : Function {
let returns = [];
}
+def ol_memcpy_rect_t : Struct {
+ let desc = "A 3D view into a buffer for `olMemcpyRect`";
+ let members = [
+ StructMember<"void*", "buffer", "the buffer backing this range">,
+ StructMember<"ol_dimensions_t", "offset", "byte coordinate offset into the space">,
+ StructMember<"size_t", "pitch", "the pitch of the buffer in bytes (i.e. how large each `x` row is)">,
+ StructMember<"size_t", "slice", "the slice of the buffer in bytes (i.e. how large each `pitch * y` plane is)">,
+ ];
+}
+
+def olMemcpyRect : Function {
+ let desc = "Enqueue a 2D or 3D memcpy operation.";
+ let details = [
+ "For host pointers, use the host device belonging to the OL_PLATFORM_BACKEND_HOST platform.",
+ "If a queue is specified, at least one device must be a non-host device",
+ "For both the source and destination, the base pointer, pitch and slice must all be aligned to 4 bytes",
+ "For 2D copies (where `Size.z` is 1), the slice value is ignored",
+ "Both the source and destination must have been allocated via `olMemAlloc`",
+ "Either the source or destination (or both) must have a non-host device",
+ "If a queue is not specified, the memcpy happens synchronously",
+ ];
+ let params = [
+ Param<"ol_queue_handle_t", "Queue", "handle of the queue.", PARAM_IN_OPTIONAL>,
+ Param<"ol_memcpy_rect_t", "DstRect", "rectangular region to copy to", PARAM_IN>,
+ Param<"ol_device_handle_t", "DstDevice", "device that DstRect.buffer belongs to", PARAM_IN>,
+ Param<"ol_memcpy_rect_t", "SrcRect", "rectangular region to copy from", PARAM_IN>,
+ Param<"ol_device_handle_t", "SrcDevice", "device that SrcRect.buffer belongs to", PARAM_IN>,
+ Param<"ol_dimensions_t", "Size", "extent of the region to copy, in bytes along each dimension", PARAM_IN>,
+ ];
+ let returns = [
+ Return<"OL_ERRC_INVALID_SIZE", [
+ "`DstRect.pitch % 4 > 0`",
+ "`DstRect.slice % 4 > 0`",
+ "`(uintptr_t)DstRect.buffer % 4 > 0`",
+ "`SrcRect.pitch % 4 > 0`",
+ "`SrcRect.slice % 4 > 0`",
+ "`(uintptr_t)SrcRect.buffer % 4 > 0`",
+ "`Size.x == 0 || Size.y == 0 || Size.z == 0`",
+ ]>,
+ Return<"OL_ERRC_INVALID_NULL_POINTER", [
+ "`DstRect.buffer == NULL`",
+ "`SrcRect.buffer == NULL`",
+ ]>,
+ Return<"OL_ERRC_INVALID_VALUE", [
+ "Either the source or destination was not allocated via `olMemAlloc`",
+ ]>,
+ Return<"OL_ERRC_INVALID_ARGUMENT", [
+ "Both arguments are the host device",
+ ]>
+ ];
+}
+
def olMemFill : Function {
let desc = "Fill memory with copies of the given pattern";
let details = [
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index 5457fc50b9711..327cc8ebd9223 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -927,6 +927,36 @@ Error olMemcpy_impl(ol_queue_handle_t Queue, void *DstPtr,
return Error::success();
}
+Error olMemcpyRect_impl(ol_queue_handle_t Queue, ol_memcpy_rect_t DstRect,
+ ol_device_handle_t DstDevice, ol_memcpy_rect_t SrcRect,
+ ol_device_handle_t SrcDevice, ol_dimensions_t Size) {
+ auto Host = OffloadContext::get().HostDevice();
+ if (DstDevice == Host && SrcDevice == Host) {
+ return createOffloadError(
+ ErrorCode::INVALID_ARGUMENT,
+ "one of DstDevice and SrcDevice must be a non-host device");
+ }
+
+ // If no queue is given the memcpy will be synchronous
+ auto QueueImpl = Queue ? Queue->AsyncInfo : nullptr;
+
+ static_assert(sizeof(ol_memcpy_rect_t) == sizeof(MemcpyRectTy));
+ auto AsPIDst = bit_cast<MemcpyRectTy>(DstRect);
+ auto AsPISrc = bit_cast<MemcpyRectTy>(SrcRect);
+ uint32_t AsPISize[3] = {Size.x, Size.y, Size.z};
+
+ if (DstDevice == Host)
+ return SrcDevice->Device->dataRetrieveRect(AsPIDst, AsPISrc, AsPISize,
+ QueueImpl);
+
+ if (SrcDevice == Host)
+ return DstDevice->Device->dataSubmitRect(AsPIDst, AsPISrc, AsPISize,
+ QueueImpl);
+
+ return DstDevice->Device->dataExchangeRect(AsPISrc, *DstDevice->Device,
+ AsPIDst, AsPISize, QueueImpl);
+}
+
Error olMemFill_impl(ol_queue_handle_t Queue, void *Ptr, size_t PatternSize,
const void *PatternPtr, size_t FillSize) {
return Queue->Device->Device->dataFill(Ptr, PatternPtr, PatternSize, FillSize,
diff --git a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.cpp b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.cpp
index bc92f4a46a5c0..471aee954b7ab 100644
--- a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.cpp
+++ b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.cpp
@@ -59,6 +59,7 @@ DLWRAP(hsa_amd_agent_iterate_memory_pools, 3)
DLWRAP(hsa_amd_memory_pool_allocate, 4)
DLWRAP(hsa_amd_memory_pool_free, 1)
DLWRAP(hsa_amd_memory_async_copy, 8)
+DLWRAP(hsa_amd_memory_async_copy_rect, 10)
DLWRAP(hsa_amd_memory_pool_get_info, 3)
DLWRAP(hsa_amd_agents_allow_access, 4)
DLWRAP(hsa_amd_memory_lock, 5)
diff --git a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
index 29cfe78082dbb..71bc8512f2f41 100644
--- a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
+++ b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
@@ -96,6 +96,26 @@ hsa_status_t hsa_amd_memory_async_copy(void *dst, hsa_agent_t dst_agent,
const hsa_signal_t *dep_signals,
hsa_signal_t completion_signal);
+enum hsa_amd_copy_direction_t {
+ hsaHostToHost = 0,
+ hsaHostToDevice = 1,
+ hsaDeviceToHost = 2,
+ hsaDeviceToDevice = 3,
+};
+
+typedef struct hsa_pitched_ptr_s {
+ void *base;
+ size_t pitch;
+ size_t slice;
+} hsa_pitched_ptr_t;
+
+hsa_status_t hsa_amd_memory_async_copy_rect(
+ const hsa_pitched_ptr_t *dst, const hsa_dim3_t *dst_offset,
+ const hsa_pitched_ptr_t *src, const hsa_dim3_t *src_offset,
+ const hsa_dim3_t *range, hsa_agent_t copy_agent,
+ hsa_amd_copy_direction_t dir, uint32_t num_dep_signals,
+ const hsa_signal_t *dep_signals, hsa_signal_t completion_signal);
+
hsa_status_t hsa_amd_agent_memory_pool_get_info(
hsa_agent_t agent, hsa_amd_memory_pool_t memory_pool,
hsa_amd_agent_memory_pool_info_t attribute, void *value);
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 64470e9fabf46..44a860a73e4f3 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -197,6 +197,23 @@ static Error asyncMemCopy(bool UseMultipleSdmaEngines, void *Dst,
#endif
}
+/// Dispatches an asynchronous 3D/2D memory copy.
+static Error asyncMemCopyRect(MemcpyRectTy Dst, MemcpyRectTy Src,
+ hsa_agent_t Agent, hsa_amd_copy_direction_t Dir,
+ uint32_t Size[3], uint32_t NumDepSignals,
+ const hsa_signal_t *DepSignals,
+ hsa_signal_t CompletionSignal) {
+ hsa_pitched_ptr_t SrcPitched{Src.Base, Src.Pitch, Src.Slice};
+ hsa_pitched_ptr_t DstPitched{Dst.Base, Dst.Pitch, Dst.Slice};
+
+ hsa_status_t S = hsa_amd_memory_async_copy_rect(
+ &DstPitched, reinterpret_cast<hsa_dim3_t *>(Dst.Offset), &SrcPitched,
+ reinterpret_cast<hsa_dim3_t *>(Src.Offset),
+ reinterpret_cast<hsa_dim3_t *>(Size), Agent, Dir, NumDepSignals,
+ DepSignals, CompletionSignal);
+ return Plugin::check(S, "error in hsa_amd_memory_async_copy_rect: %s");
+}
+
static Error getTargetTripleAndFeatures(hsa_agent_t Agent,
SmallVector<SmallString<32>> &Targets) {
auto Err = hsa_utils::iterateAgentISAs(Agent, [&](hsa_isa_t ISA) {
@@ -1365,6 +1382,33 @@ struct AMDGPUStreamTy {
OutputSignal->get());
}
+ /// Push an asynchronous 2D or 3D memory copy between pinned memory buffers.
+ Error pushPinnedMemoryCopyRectAsync(MemcpyRectTy Dst, MemcpyRectTy Src,
+ uint32_t CopySize[3],
+ hsa_amd_copy_direction_t Dir) {
+ // Retrieve an available signal for the operation's output.
+ AMDGPUSignalTy *OutputSignal = nullptr;
+ if (auto Err = SignalManager.getResource(OutputSignal))
+ return Err;
+ OutputSignal->reset();
+ OutputSignal->increaseUseCount();
+
+ std::lock_guard<std::mutex> Lock(Mutex);
+
+ // Consume stream slot and compute dependencies.
+ auto [Curr, InputSignal] = consume(OutputSignal);
+
+ // Issue the async memory copy.
+ if (InputSignal && InputSignal->load()) {
+ hsa_signal_t InputSignalRaw = InputSignal->get();
+ return hsa_utils::asyncMemCopyRect(Dst, Src, Agent, Dir, CopySize, 1,
+ &InputSignalRaw, OutputSignal->get());
+ }
+
+ return hsa_utils::asyncMemCopyRect(Dst, Src, Agent, Dir, CopySize, 0,
+ nullptr, OutputSignal->get());
+ }
+
/// Push an asynchronous memory copy device-to-host involving an unpinned
/// memory buffer. The operation consists of a two-step copy from the
/// device buffer to an intermediate pinned host buffer, and then, to a
@@ -1539,6 +1583,37 @@ struct AMDGPUStreamTy {
OutputSignal->get());
}
+ Error pushMemoryCopyD2DRectAsync(MemcpyRectTy Dst, MemcpyRectTy Src,
+ hsa_agent_t Agent, uint32_t CopySize[3]) {
+ AMDGPUSignalTy *OutputSignal;
+ if (auto Err = SignalManager.getResources(/*Num=*/1, &OutputSignal))
+ return Err;
+ OutputSignal->reset();
+ OutputSignal->increaseUseCount();
+
+ std::lock_guard<std::mutex> Lock(Mutex);
+
+ // Consume stream slot and compute dependencies.
+ auto [Curr, InputSignal] = consume(OutputSignal);
+
+ // The agents need to have access to the corresponding memory
+ // This is presently only true if the pointers were originally
+ // allocated by this runtime or the caller made the appropriate
+ // access calls.
+
+ // TODO: Cross device transfers might not work
+
+ if (InputSignal && InputSignal->load()) {
+ hsa_signal_t InputSignalRaw = InputSignal->get();
+ return hsa_utils::asyncMemCopyRect(Dst, Src, Agent, hsaDeviceToDevice,
+ CopySize, 1, &InputSignalRaw,
+ OutputSignal->get());
+ }
+ return hsa_utils::asyncMemCopyRect(Dst, Src, Agent, hsaDeviceToDevice,
+ CopySize, 0, nullptr,
+ OutputSignal->get());
+ }
+
Error pushHostCallback(void (*Callback)(void *), void *UserData) {
// Retrieve an available signal for the operation's output.
AMDGPUSignalTy *OutputSignal = nullptr;
@@ -2523,6 +2598,28 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
PinnedMemoryManager);
}
+ /// 2D/3D host to device transfer.
+ Error dataSubmitRectImpl(MemcpyRectTy TgtRect, MemcpyRectTy HstRect,
+ uint32_t Size[3],
+ AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+ AMDGPUStreamTy *Stream = nullptr;
+
+ // Use one-step asynchronous operation when host memory is already pinned.
+ if (void *PinnedPtr =
+ PinnedAllocs.getDeviceAccessiblePtrFromPinnedBuffer(HstRect.Base)) {
+ if (auto Err = getStream(AsyncInfoWrapper, Stream))
+ return Err;
+
+ HstRect.Base = PinnedPtr;
+ return Stream->pushPinnedMemoryCopyRectAsync(TgtRect, HstRect, Size,
+ hsaHostToDevice);
+ }
+
+ return Plugin::error(ErrorCode::INVALID_VALUE,
+ "AMDGPU doesn't support 2D/3D copies involving memory "
+ "not allocated with `olMemAlloc`");
+ }
+
/// Retrieve data from the device (device to host transfer).
Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size,
AsyncInfoWrapperTy &AsyncInfoWrapper) override {
@@ -2583,6 +2680,27 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
PinnedMemoryManager);
}
+ /// 2D/3D device to host transfer.
+ Error dataRetrieveRectImpl(MemcpyRectTy HstRect, MemcpyRectTy TgtRect,
+ uint32_t Size[3],
+ AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+ AMDGPUStreamTy *Stream = nullptr;
+
+ if (void *PinnedPtr =
+ PinnedAllocs.getDeviceAccessiblePtrFromPinnedBuffer(HstRect.Base)) {
+ if (auto Err = getStream(AsyncInfoWrapper, Stream))
+ return Err;
+
+ HstRect.Base = PinnedPtr;
+ return Stream->pushPinnedMemoryCopyRectAsync(HstRect, TgtRect, Size,
+ hsaDeviceToHost);
+ }
+
+ return Plugin::error(ErrorCode::INVALID_VALUE,
+ "AMDGPU doesn't support 2D/3D copies involving memory "
+ "not allocated with `olMemAlloc`");
+ }
+
/// Exchange data between two devices within the plugin.
Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstGenericDevice,
void *DstPtr, int64_t Size,
@@ -2620,6 +2738,20 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
getAgent(), (uint64_t)Size);
}
+ /// 2D/3D device to device transfer.
+ Error dataExchangeRectImpl(MemcpyRectTy SrcRect,
+ GenericDeviceTy &DstGenericDevice,
+ MemcpyRectTy DstRect, uint32_t Size[3],
+ AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+ AMDGPUDeviceTy &DstDevice = static_cast<AMDGPUDeviceTy &>(DstGenericDevice);
+ AMDGPUStreamTy *Stream = nullptr;
+
+ if (auto Err = getStream(AsyncInfoWrapper, Stream))
+ return Err;
+ return Stream->pushMemoryCopyD2DRectAsync(DstRect, SrcRect,
+ DstDevice.getAgent(), Size);
+ }
+
/// Insert a data fence between previous data operations and the following
/// operations. This is a no-op for AMDGPU devices as operations inserted into
/// a queue are in-order.
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 5620437716b31..9efd6639339e0 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -921,12 +921,25 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
virtual Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size,
AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
+ Error dataSubmitRect(MemcpyRectTy TgtRect, const MemcpyRectTy HstRect,
+ uint32_t Size[3], __tgt_async_info *AsyncInfo);
+ virtual Error dataSubmitRectImpl(const MemcpyRectTy TgtRect,
+ MemcpyRectTy HstRect, uint32_t Size[3],
+ AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
+
/// Retrieve data from the device (device to host transfer).
Error dataRetrieve(void *HstPtr, const void *TgtPtr, int64_t Size,
__tgt_async_info *AsyncInfo);
virtual Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size,
AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
+ Error dataRetrieveRect(MemcpyRectTy HstRect, const MemcpyRectTy TgtRect,
+ uint32_t Size[3], __tgt_async_info *AsyncInfo);
+ virtual Error dataRetrieveRectImpl(MemcpyRectTy HstRect,
+ const MemcpyRectTy TgtRect,
+ uint32_t Size[3],
+ AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
+
/// Instert a data fence between previous data operations and the following
/// operations if necessary for the device
virtual Error dataFence(__tgt_async_info *AsyncInfo) = 0;
@@ -940,6 +953,15 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
void *DstPtr, int64_t Size,
AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
+ Error dataExchangeRect(MemcpyRectTy SrcRect, GenericDeviceTy &DstDev,
+ const MemcpyRectTy DstRect, uint32_t Size[3],
+ __tgt_async_info *AsyncInfo);
+ virtual Error dataExchangeRectImpl(MemcpyRectTy SrcRect,
+ GenericDeviceTy &DstDev,
+ const MemcpyRectTy DstRect,
+ uint32_t Size[3],
+ AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
+
/// Fill data on the device with a pattern from the host
Error dataFill(void *TgtPtr, const void *PatternPtr, int64_t PatternSize,
int64_t Size, __tgt_async_info *AsyncInfo);
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 30b5db782370d..78823674eb146 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -1423,6 +1423,17 @@ Error GenericDeviceTy::dataSubmit(void *TgtPtr, const void *HstPtr,
return Err;
}
+Error GenericDeviceTy::dataSubmitRect(MemcpyRectTy TgtRect,
+ const MemcpyRectTy HstRect,
+ uint32_t Size[3],
+ __tgt_async_info *AsyncInfo) {
+ AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);
+
+ auto Err = dataSubmitRectImpl(TgtRect, HstRect, Size, AsyncInfoWrapper);
+ AsyncInfoWrapper.finalize(Err);
+ return Err;
+}
+
Error GenericDeviceTy::dataRetrieve(void *HstPtr, const void *TgtPtr,
int64_t Size, __tgt_async_info *AsyncInfo) {
AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);
@@ -1432,6 +1443,17 @@ Error GenericDeviceTy::dataRetrieve(void *HstPtr, const void *TgtPtr,
return Err;
}
+Error GenericDeviceTy::dataRetrieveRect(MemcpyRectTy HstRect,
+ const MemcpyRectTy TgtRect,
+ uint32_t Size[3],
+ __tgt_async_info *AsyncInfo) {
+ AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);
+
+ auto Err = dataRetrieveRectImpl(HstRect, TgtRect, Size, AsyncInfoWrapper);
+ AsyncInfoWrapper.finalize(Err);
+ return Err;
+}
+
Error GenericDeviceTy::dataExchange(const void *SrcPtr, GenericDeviceTy &DstDev,
void *DstPtr, int64_t Size,
__tgt_async_info *AsyncInfo) {
@@ -1442,6 +1464,19 @@ Error GenericDeviceTy::dataExchange(const void *SrcPtr, GenericDeviceTy &DstDev,
return Err;
}
+Error GenericDeviceTy::dataExchangeRect(MemcpyRectTy SrcRect,
+ GenericDeviceTy &DstDev,
+ const MemcpyRectTy DstRect,
+ uint32_t Size[3],
+ __tgt_async_info *AsyncInfo) {
+ AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);
+
+ auto Err =
+ dataExchangeRectImpl(SrcRect, DstDev, DstRect, Size, AsyncInfoWrapper);
+ AsyncInfoWrapper.finalize(Err);
+ return Err;
+}
+
Error GenericDeviceTy::dataFill(void *TgtPtr, const void *PatternPtr,
int64_t PatternSize, int64_t Size,
__tgt_async_info *AsyncInfo) {
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index b2f840113cff3..0448b9ba9873d 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -814,6 +814,13 @@ struct CUDADeviceTy : public GenericDeviceTy {
return Plugin::check(Res, "error in cuMemcpyHtoDAsync: %s");
}
+ Error dataSubmitRectImpl(MemcpyRectTy TgtRect, MemcpyRectTy HstRect,
+ uint32_t Size[3],
+ AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+ return Plugin::error(ErrorCode::UNIMPLEMENTED,
+ "CUDA does not yet support 2D/3D copies");
+ }
+
/// Retrieve data from the device (device to host transfer).
Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size,
AsyncInfoWrapperTy &AsyncInfoWrapper) override {
@@ -828,12 +835,26 @@ struct CUDADeviceTy : public GenericDeviceTy {
return Plugin::check(Res, "error in cuMemcpyDtoHAsync: %s");
}
+ Error dataRetrieveRectImpl(MemcpyRectTy HstRect, MemcpyRectTy TgtRect,
+ uint32_t Size[3],
+ AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+ return Plugin::error(ErrorCode::UNIMPLEMENTED,
+ "CUDA does not yet support 2D/3D copies");
+ }
+
/// Exchange data between two devices directly. We may use peer access if
/// the CUDA devices and driver allow them.
Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstGenericDevice,
void *DstPtr, int64_t Size,
AsyncInfoWrapperTy &AsyncInfoWrapper) override;
+ Error dataExchangeRectImpl(MemcpyRectTy SrcRect, GenericDeviceTy &Dst,
+ MemcpyRectTy DstRect, uint32_t Size[3],
+ AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+ return Plugin::error(ErrorCode::UNIMPLEMENTED,
+ "CUDA does not yet support 2D/3D copies");
+ }
+
Error dataFillImpl(void *TgtPtr, const void *PatternPtr, int64_t PatternSize,
int64_t Size,
AsyncInfoWrapperTy &AsyncInfoWrapper) override {
diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp
index 44e2584fe53cc..66f4de83e9275 100644
--- a/offload/plugins-nextgen/host/src/rtl.cpp
+++ b/offload/plugins-nextgen/host/src/rtl.cpp
@@ -285,6 +285,13 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
return Plugin::success();
}
+ Error dataSubmitRectImpl(MemcpyRectTy TgtRect, MemcpyRectTy HstRect,
+ uint32_t Size[3],
+ AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+ return Plugin::error(ErrorCode::UNIMPLEMENTED,
+ "Host does not yet support 2D/3D copies");
+ }
+
/// Retrieve data from the device (device to host transfer).
Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size,
AsyncInfoWrapperTy &AsyncInfoWrapper) override {
@@ -292,6 +299,13 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
return Plugin::success();
}
+ Error dataRetrieveRectImpl(MemcpyRectTy HstRect, MemcpyRectTy TgtRect,
+ uint32_t Size[3],
+ AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+ return Plugin::error(ErrorCode::UNIMPLEMENTED,
+ "Host does not yet support 2D/3D copies");
+ }
+
/// Exchange data between two devices within the plugin. This function is not
/// supported in this plugin.
Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstGenericDevice,
@@ -303,6 +317,13 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
"dataExchangeImpl not supported");
}
+ Error dataExchangeRectImpl(MemcpyRectTy SrcRect, GenericDeviceTy &Dst,
+ MemcpyRectTy DstRect, uint32_t Size[3],
+ AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+ return Plugin::error(ErrorCode::UNIMPLEMENTED,
+ "Host does not yet support 2D/3D copies");
+ }
+
/// Insert a data fence between previous data operations and the following
/// operations. This is a no-op for Host devices as operations inserted into
/// a queue are in-order.
diff --git a/offload/unittests/OffloadAPI/CMakeLists.txt b/offload/unittests/OffloadAPI/CMakeLists.txt
index ba35c1ee87aac..ae3abba27801f 100644
--- a/offload/unittests/OffloadAPI/CMakeLists.txt
+++ b/offload/unittests/OffloadAPI/CMakeLists.txt
@@ -27,7 +27,8 @@ add_offload_unittest("memory"
memory/olMemAlloc.cpp
memory/olMemFill.cpp
memory/olMemFree.cpp
- memory/olMemcpy.cpp)
+ memory/olMemcpy.cpp
+ memory/olMemcpyRect.cpp)
add_offload_unittest("platform"
platform/olGetPlatformInfo.cpp
diff --git a/offload/unittests/OffloadAPI/memory/olMemcpyRect.cpp b/offload/unittests/OffloadAPI/memory/olMemcpyRect.cpp
new file mode 100644
index 0000000000000..0118383d52d0b
--- /dev/null
+++ b/offload/unittests/OffloadAPI/memory/olMemcpyRect.cpp
@@ -0,0 +1,358 @@
+//===------- Offload API tests - olMemcpyRect ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "../common/Fixtures.hpp"
+#include <OffloadAPI.h>
+#include <gtest/gtest.h>
+
+constexpr ol_dimensions_t FULL_SIZE = {16, 8, 4};
+constexpr size_t BYTES = FULL_SIZE.x * FULL_SIZE.y * FULL_SIZE.z;
+
+constexpr ol_dimensions_t COPY_SIZE = {4, 3, 2};
+constexpr ol_dimensions_t COPY_OFFSET = {8, 2, 1};
+
+struct olMemcpyRectTest : OffloadQueueTest {
+ void SetUp() override {
+ RETURN_ON_FATAL_FAILURE(OffloadQueueTest::SetUp());
+
+ ol_platform_handle_t Platform;
+ ASSERT_SUCCESS(olGetDeviceInfo(Device, OL_DEVICE_INFO_PLATFORM,
+ sizeof(Platform), &Platform));
+ ol_platform_backend_t Backend;
+ ASSERT_SUCCESS(olGetPlatformInfo(Platform, OL_PLATFORM_INFO_BACKEND,
+ sizeof(Backend), &Backend));
+ if (Backend == OL_PLATFORM_BACKEND_CUDA)
+ GTEST_SKIP() << "CUDA does not yet support this entry point\n";
+
+ Buff.fill('h');
+ ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_HOST, BYTES, &HostPtr));
+ ASSERT_SUCCESS(
+ olMemcpy(nullptr, HostPtr, Device, Buff.data(), Host, BYTES));
+
+ Buff.fill('d');
+ ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, BYTES, &DevicePtr));
+ ASSERT_SUCCESS(
+ olMemcpy(nullptr, DevicePtr, Device, Buff.data(), Host, BYTES));
+
+ Buff.fill('D');
+ ASSERT_SUCCESS(
+ olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, BYTES, &DevicePtr2));
+ ASSERT_SUCCESS(
+ olMemcpy(nullptr, DevicePtr2, Device, Buff.data(), Host, BYTES));
+
+ SrcRect.offset = DstRect.offset = COPY_OFFSET;
+ SrcRect.pitch = DstRect.pitch = FULL_SIZE.x;
+ SrcRect.slice = DstRect.slice = FULL_SIZE.y * FULL_SIZE.x;
+ }
+
+ void TearDown() override {
+ ASSERT_SUCCESS(olMemFree(HostPtr));
+ ASSERT_SUCCESS(olMemFree(DevicePtr));
+ }
+
+ void checkPattern(void *CheckBuffer, const char *Template) {
+ ASSERT_SUCCESS(
+ olMemcpy(nullptr, Buff.data(), Host, CheckBuffer, Device, BYTES));
+ bool Failed = false;
+
+ for (size_t I = 0; I < BYTES; I++) {
+ if (Buff[I] != Template[I]) {
+ ADD_FAILURE() << "Failure at location " << I << "\n";
+ Failed = true;
+ break;
+ }
+ }
+
+ if (Failed) {
+ std::cerr << "Expected:\n";
+ printSlices([&](size_t I) -> char { return Template[I]; });
+ std::cerr << "Got:\n";
+ printSlices([&](size_t I) -> char { return Buff[I]; });
+ std::cerr << "Delta:\n";
+ printSlices(
+ [&](size_t I) -> char { return Buff[I] == Template[I] ? '.' : 'X'; });
+ }
+ }
+
+ template <typename F> void printSlices(F Getter) {
+ for (size_t Y = 0; Y < FULL_SIZE.y; Y++) {
+ for (size_t Z = 0; Z < FULL_SIZE.z; Z++) {
+ for (size_t X = 0; X < FULL_SIZE.x; X++) {
+ std::cerr << Getter(X + (Y * FULL_SIZE.x) +
+ (Z * FULL_SIZE.y * FULL_SIZE.x));
+ }
+ std::cerr << " ";
+ }
+
+ std::cerr << "\n";
+ }
+ }
+
+ std::array<uint8_t, BYTES> Buff;
+ void *HostPtr;
+ void *DevicePtr;
+ void *DevicePtr2;
+ ol_memcpy_rect_t SrcRect;
+ ol_memcpy_rect_t DstRect;
+};
+OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olMemcpyRectTest);
+
+TEST_P(olMemcpyRectTest, SuccessHtoD) {
+ DstRect.buffer = DevicePtr;
+ SrcRect.buffer = HostPtr;
+
+ ASSERT_SUCCESS(
+ olMemcpyRect(Queue, DstRect, Device, SrcRect, Host, COPY_SIZE));
+ ASSERT_SUCCESS(olSyncQueue(Queue));
+
+ // clang-format off
+ checkPattern(DevicePtr,
+ "dddddddddddddddd"
+ "dddddddddddddddd"
+ "dddddddddddddddd"
+ "dddddddddddddddd"
+ "dddddddddddddddd"
+ "dddddddddddddddd"
+ "dddddddddddddddd"
+ "dddddddddddddddd"
+
+ "dddddddddddddddd"
+ "dddddddddddddddd"
+ "ddddddddhhhhdddd"
+ "ddddddddhhhhdddd"
+ "ddddddddhhhhdddd"
+ "dddddddddddddddd"
+ "dddddddddddddddd"
+ "dddddddddddddddd"
+
+ "dddddddddddddddd"
+ "dddddddddddddddd"
+ "ddddddddhhhhdddd"
+ "ddddddddhhhhdddd"
+ "ddddddddhhhhdddd"
+ "dddddddddddddddd"
+ "dddddddddddddddd"
+ "dddddddddddddddd"
+
+ "dddddddddddddddd"
+ "dddddddddddddddd"
+ "dddddddddddddddd"
+ "dddddddddddddddd"
+ "dddddddddddddddd"
+ "dddddddddddddddd"
+ "dddddddddddddddd"
+ "dddddddddddddddd"
+ );
+ // clang-format on
+}
+
+TEST_P(olMemcpyRectTest, SuccessDtoH) {
+ DstRect.buffer = HostPtr;
+ SrcRect.buffer = DevicePtr;
+
+ ASSERT_SUCCESS(
+ olMemcpyRect(Queue, DstRect, Host, SrcRect, Device, COPY_SIZE));
+ ASSERT_SUCCESS(olSyncQueue(Queue));
+
+ // clang-format off
+ checkPattern(HostPtr,
+ "hhhhhhhhhhhhhhhh"
+ "hhhhhhhhhhhhhhhh"
+ "hhhhhhhhhhhhhhhh"
+ "hhhhhhhhhhhhhhhh"
+ "hhhhhhhhhhhhhhhh"
+ "hhhhhhhhhhhhhhhh"
+ "hhhhhhhhhhhhhhhh"
+ "hhhhhhhhhhhhhhhh"
+
+ "hhhhhhhhhhhhhhhh"
+ "hhhhhhhhhhhhhhhh"
+ "hhhhhhhhddddhhhh"
+ "hhhhhhhhddddhhhh"
+ "hhhhhhhhddddhhhh"
+ "hhhhhhhhhhhhhhhh"
+ "hhhhhhhhhhhhhhhh"
+ "hhhhhhhhhhhhhhhh"
+
+ "hhhhhhhhhhhhhhhh"
+ "hhhhhhhhhhhhhhhh"
+ "hhhhhhhhddddhhhh"
+ "hhhhhhhhddddhhhh"
+ "hhhhhhhhddddhhhh"
+ "hhhhhhhhhhhhhhhh"
+ "hhhhhhhhhhhhhhhh"
+ "hhhhhhhhhhhhhhhh"
+
+ "hhhhhhhhhhhhhhhh"
+ "hhhhhhhhhhhhhhhh"
+ "hhhhhhhhhhhhhhhh"
+ "hhhhhhhhhhhhhhhh"
+ "hhhhhhhhhhhhhhhh"
+ "hhhhhhhhhhhhhhhh"
+ "hhhhhhhhhhhhhhhh"
+ "hhhhhhhhhhhhhhhh"
+ );
+ // clang-format on
+}
+
+TEST_P(olMemcpyRectTest, SuccessDtoD) {
+ DstRect.buffer = DevicePtr;
+ SrcRect.buffer = DevicePtr2;
+
+ ASSERT_SUCCESS(
+ olMemcpyRect(Queue, DstRect, Device, SrcRect, Device, COPY_SIZE));
+ ASSERT_SUCCESS(olSyncQueue(Queue));
+
+ // clang-format off
+ checkPattern(DevicePtr,
+ "dddddddddddddddd"
+ "dddddddddddddddd"
+ "dddddddddddddddd"
+ "dddddddddddddddd"
+ "dddddddddddddddd"
+ "dddddddddddddddd"
+ "dddddddddddddddd"
+ "dddddddddddddddd"
+
+ "dddddddddddddddd"
+ "dddddddddddddddd"
+ "ddddddddDDDDdddd"
+ "ddddddddDDDDdddd"
+ "ddddddddDDDDdddd"
+ "dddddddddddddddd"
+ "dddddddddddddddd"
+ "dddddddddddddddd"
+
+ "dddddddddddddddd"
+ "dddddddddddddddd"
+ "ddddddddDDDDdddd"
+ "ddddddddDDDDdddd"
+ "ddddddddDDDDdddd"
+ "dddddddddddddddd"
+ "dddddddddddddddd"
+ "dddddddddddddddd"
+
+ "dddddddddddddddd"
+ "dddddddddddddddd"
+ "dddddddddddddddd"
+ "dddddddddddddddd"
+ "dddddddddddddddd"
+ "dddddddddddddddd"
+ "dddddddddddddddd"
+ "dddddddddddddddd"
+ );
+ // clang-format on
+}
+
+TEST_P(olMemcpyRectTest, InvalidDstPtr) {
+ DstRect.buffer = nullptr;
+ SrcRect.buffer = HostPtr;
+
+ ASSERT_ERROR(OL_ERRC_INVALID_NULL_POINTER,
+ olMemcpyRect(Queue, DstRect, Device, SrcRect, Host, COPY_SIZE));
+}
+
+TEST_P(olMemcpyRectTest, InvalidSrcPtr) {
+ DstRect.buffer = HostPtr;
+ SrcRect.buffer = nullptr;
+
+ ASSERT_ERROR(OL_ERRC_INVALID_NULL_POINTER,
+ olMemcpyRect(Queue, DstRect, Device, SrcRect, Host, COPY_SIZE));
+}
+
+TEST_P(olMemcpyRectTest, InvalidDstDevice) {
+ DstRect.buffer = HostPtr;
+ SrcRect.buffer = DevicePtr;
+
+ ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE,
+ olMemcpyRect(Queue, DstRect, nullptr, SrcRect, Host, COPY_SIZE));
+}
+
+TEST_P(olMemcpyRectTest, InvalidSrcDevice) {
+ DstRect.buffer = HostPtr;
+ SrcRect.buffer = DevicePtr;
+
+ ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE,
+ olMemcpyRect(Queue, DstRect, Host, SrcRect, nullptr, COPY_SIZE));
+}
+
+TEST_P(olMemcpyRectTest, InvalidSize) {
+ DstRect.buffer = HostPtr;
+ SrcRect.buffer = DevicePtr;
+
+ ASSERT_ERROR(OL_ERRC_INVALID_SIZE,
+ olMemcpyRect(Queue, DstRect, Host, SrcRect, Device, {0, 0, 0}));
+}
+
+TEST_P(olMemcpyRectTest, InvalidSrcPtrAlign) {
+ DstRect.buffer = HostPtr;
+ SrcRect.buffer = &static_cast<char *>(DevicePtr)[2];
+
+ ASSERT_ERROR(OL_ERRC_INVALID_SIZE,
+ olMemcpyRect(Queue, DstRect, Host, SrcRect, Device, COPY_SIZE));
+}
+
+TEST_P(olMemcpyRectTest, InvalidDstPtrAlign) {
+ DstRect.buffer = &static_cast<char *>(HostPtr)[2];
+ SrcRect.buffer = DevicePtr;
+
+ ASSERT_ERROR(OL_ERRC_INVALID_SIZE,
+ olMemcpyRect(Queue, DstRect, Host, SrcRect, Device, COPY_SIZE));
+}
+
+TEST_P(olMemcpyRectTest, InvalidDstPitchAlign) {
+ DstRect.buffer = HostPtr;
+ DstRect.pitch = 2;
+ SrcRect.buffer = DevicePtr;
+
+ ASSERT_ERROR(OL_ERRC_INVALID_SIZE,
+ olMemcpyRect(Queue, DstRect, Host, SrcRect, Device, COPY_SIZE));
+}
+
+TEST_P(olMemcpyRectTest, InvalidSrcPitchAlign) {
+ DstRect.buffer = HostPtr;
+ SrcRect.buffer = DevicePtr;
+ SrcRect.pitch = 2;
+
+ ASSERT_ERROR(OL_ERRC_INVALID_SIZE,
+ olMemcpyRect(Queue, DstRect, Host, SrcRect, Device, COPY_SIZE));
+}
+
+TEST_P(olMemcpyRectTest, InvalidDstSliceAlign) {
+ DstRect.buffer = HostPtr;
+ DstRect.slice = 2;
+ SrcRect.buffer = DevicePtr;
+
+ ASSERT_ERROR(OL_ERRC_INVALID_SIZE,
+ olMemcpyRect(Queue, DstRect, Host, SrcRect, Device, COPY_SIZE));
+}
+
+TEST_P(olMemcpyRectTest, InvalidSrcSliceAlign) {
+ DstRect.buffer = HostPtr;
+ SrcRect.buffer = DevicePtr;
+ SrcRect.slice = 2;
+
+ ASSERT_ERROR(OL_ERRC_INVALID_SIZE,
+ olMemcpyRect(Queue, DstRect, Host, SrcRect, Device, COPY_SIZE));
+}
+
+TEST_P(olMemcpyRectTest, InvalidDstUnalloc) {
+ DstRect.buffer = Buff.data();
+ SrcRect.buffer = DevicePtr;
+
+ ASSERT_ERROR(OL_ERRC_INVALID_VALUE,
+ olMemcpyRect(Queue, DstRect, Host, SrcRect, Device, COPY_SIZE));
+}
+
+TEST_P(olMemcpyRectTest, InvalidSrcUnalloc) {
+ DstRect.buffer = DevicePtr;
+ SrcRect.buffer = Buff.data();
+
+ ASSERT_ERROR(OL_ERRC_INVALID_VALUE,
+ olMemcpyRect(Queue, DstRect, Device, SrcRect, Host, COPY_SIZE));
+}
More information about the llvm-commits
mailing list