[llvm] [Offload] Add `olMemcpyRect` with amdgpu implementation (PR #160321)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Sep 24 02:57:17 PDT 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Author: Ross Brunton (RossBrunton)
<details>
<summary>Changes</summary>
This adds a memcpy function that copies a 2D or 3D region rather than a
linear byte count. This is an early version and has several limitations
(some of which are temporary, some permanent); a usage sketch follows the
list:
* Only amdgpu is supported (CUDA returns `UNSUPPORTED`).
* The queue is required.
* The pitch, slice and base address must be aligned to 4 bytes.
* All buffers must have been allocated through `olMemAlloc` (i.e., for
the amd runtime they must be "pinned").
* Host-to-host copies are not supported.
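
For illustration, a minimal call might look like the sketch below. It is not part of the patch: the handle names (`Queue`, `HostDev`, `GpuDev`) and buffer pointers are placeholders, nothing here is guaranteed to match the generated headers exactly, and both buffers are assumed to have come from `olMemAlloc` so the pinning and 4-byte-alignment requirements above hold.

```cpp
// Hypothetical usage sketch (not part of the patch): copy a 16x8 byte region
// from a device buffer to a host buffer, both laid out with a 64-byte pitch.
// Size.z == 1 makes this a 2D copy, so the slice values are ignored.
ol_memcpy_rect_t Src{/*buffer=*/DevPtr, /*offset=*/{0, 0, 0},
                     /*pitch=*/64, /*slice=*/64 * 8};
ol_memcpy_rect_t Dst{/*buffer=*/HostPtr, /*offset=*/{0, 0, 0},
                     /*pitch=*/64, /*slice=*/64 * 8};
ol_dimensions_t Size{/*x=*/16, /*y=*/8, /*z=*/1};

// DstDevice/SrcDevice identify which device each buffer belongs to; host
// pointers use the device from the OL_PLATFORM_BACKEND_HOST platform.
olMemcpyRect(Queue, Dst, HostDev, Src, GpuDev, Size);
```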
---
Patch is 36.10 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/160321.diff
12 Files Affected:
- (modified) offload/include/Shared/APITypes.h (+9)
- (modified) offload/liboffload/API/Memory.td (+54-1)
- (modified) offload/liboffload/src/OffloadImpl.cpp (+30)
- (modified) offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.cpp (+1)
- (modified) offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h (+20)
- (modified) offload/plugins-nextgen/amdgpu/src/rtl.cpp (+132)
- (modified) offload/plugins-nextgen/common/include/PluginInterface.h (+22)
- (modified) offload/plugins-nextgen/common/src/PluginInterface.cpp (+35)
- (modified) offload/plugins-nextgen/cuda/src/rtl.cpp (+21)
- (modified) offload/plugins-nextgen/host/src/rtl.cpp (+21)
- (modified) offload/unittests/OffloadAPI/CMakeLists.txt (+2-1)
- (added) offload/unittests/OffloadAPI/memory/olMemcpyRect.cpp (+358)
``````````diff
diff --git a/offload/include/Shared/APITypes.h b/offload/include/Shared/APITypes.h
index 8c150b6bfc2d4..e2387900740a5 100644
--- a/offload/include/Shared/APITypes.h
+++ b/offload/include/Shared/APITypes.h
@@ -126,6 +126,15 @@ struct KernelLaunchParamsTy {
/// Ptrs to the Data entries. Only strictly required for the host plugin.
void **Ptrs = nullptr;
};
+
+/// Rectangular range for rect memcpies. Should be the same layout as
+/// liboffload's `ol_memcpy_rect_t`.
+struct MemcpyRectTy {
+ void *Base;
+ uint32_t Offset[3];
+ size_t Pitch;
+ size_t Slice;
+};
}
#endif // OMPTARGET_SHARED_API_TYPES_H
diff --git a/offload/liboffload/API/Memory.td b/offload/liboffload/API/Memory.td
index debda165d2b23..4bdaa1aa73a26 100644
--- a/offload/liboffload/API/Memory.td
+++ b/offload/liboffload/API/Memory.td
@@ -22,7 +22,8 @@ def ol_alloc_type_t : Enum {
def olMemAlloc : Function {
let desc = "Creates a memory allocation on the specified device.";
let details = [
- "All allocations through olMemAlloc regardless of source share a single virtual address range. There is no risk of multiple devices returning equal pointers to different memory."
+ "All allocations through olMemAlloc regardless of source share a single virtual address range. There is no risk of multiple devices returning equal pointers to different memory.",
+    "The returned memory allocation will be aligned at least to a 4 byte boundary.",
];
let params = [
Param<"ol_device_handle_t", "Device", "handle of the device to allocate on", PARAM_IN>,
@@ -63,6 +64,58 @@ def olMemcpy : Function {
let returns = [];
}
+def ol_memcpy_rect_t : Struct {
+ let desc = "A 3D view into a buffer for `olMemcpyRect`";
+ let members = [
+ StructMember<"void*", "buffer", "the buffer backing this range">,
+ StructMember<"ol_dimensions_t", "offset", "byte coordinate offset into the space">,
+ StructMember<"size_t", "pitch", "the pitch of the buffer in bytes (i.e. how large each `x` row is)">,
+ StructMember<"size_t", "slice", "the slice of the buffer in bytes (i.e. how large each `pitch * y` plane is)">,
+ ];
+}
+
+def olMemcpyRect : Function {
+ let desc = "Enqueue a 2D or 3D memcpy operation.";
+ let details = [
+ "For host pointers, use the host device belonging to the OL_PLATFORM_BACKEND_HOST platform.",
+ "If a queue is specified, at least one device must be a non-host device",
+ "For both the source and destination, the base pointer, pitch and slice must all be aligned to 4 bytes",
+ "For 2D copies (where `Size.z` is 1), the slice value is ignored",
+ "Both the source and destination must have been allocated via `olMemAlloc`",
+ "Either the source or destination (or both) must have a non-host device",
+ "If a queue is not specified, the memcpy happens synchronously",
+ ];
+ let params = [
+ Param<"ol_queue_handle_t", "Queue", "handle of the queue.", PARAM_IN_OPTIONAL>,
+ Param<"ol_memcpy_rect_t", "DstRect", "pointer to copy to", PARAM_IN>,
+ Param<"ol_device_handle_t", "DstDevice", "device that DstPtr belongs to", PARAM_IN>,
+ Param<"ol_memcpy_rect_t", "SrcRect", "pointer to copy from", PARAM_IN>,
+ Param<"ol_device_handle_t", "SrcDevice", "device that SrcPtr belongs to", PARAM_IN>,
+ Param<"ol_dimensions_t", "Size", "size in bytes of data to copy", PARAM_IN>,
+ ];
+ let returns = [
+ Return<"OL_ERRC_INVALID_SIZE", [
+ "`DstRect.pitch % 4 > 0`",
+ "`DstRect.slice % 4 > 0`",
+ "`(uintptr_t)DstRect.buffer % 4 > 0`",
+ "`SrcRect.pitch % 4 > 0`",
+ "`SrcRect.slice % 4 > 0`",
+ "`(uintptr_t)SrcRect.buffer % 4 > 0`",
+ "`Size.x == 0 || Size.y == 0 || Size.z == 0`",
+ ]>,
+ Return<"OL_ERRC_INVALID_NULL_POINTER", [
+ "`DstRect.buffer == NULL`",
+ "`SrcRect.buffer == NULL`",
+ ]>,
+ Return<"OL_ERRC_INVALID_VALUE", [
+ "Either the source or destination was not allocated via `olMemAlloc`",
+ ]>,
+ Return<"OL_ERRC_INVALID_ARGUMENT", [
+      "Both `DstDevice` and `SrcDevice` are the host device",
+ ]>
+ ];
+}
+
def olMemFill : Function {
let desc = "Fill memory with copies of the given pattern";
let details = [
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index 5457fc50b9711..327cc8ebd9223 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -927,6 +927,36 @@ Error olMemcpy_impl(ol_queue_handle_t Queue, void *DstPtr,
return Error::success();
}
+Error olMemcpyRect_impl(ol_queue_handle_t Queue, ol_memcpy_rect_t DstRect,
+ ol_device_handle_t DstDevice, ol_memcpy_rect_t SrcRect,
+ ol_device_handle_t SrcDevice, ol_dimensions_t Size) {
+ auto Host = OffloadContext::get().HostDevice();
+ if (DstDevice == Host && SrcDevice == Host) {
+ return createOffloadError(
+ ErrorCode::INVALID_ARGUMENT,
+ "one of DstDevice and SrcDevice must be a non-host device");
+ }
+
+ // If no queue is given the memcpy will be synchronous
+ auto QueueImpl = Queue ? Queue->AsyncInfo : nullptr;
+
+ static_assert(sizeof(ol_memcpy_rect_t) == sizeof(MemcpyRectTy));
+ auto AsPIDst = bit_cast<MemcpyRectTy>(DstRect);
+ auto AsPISrc = bit_cast<MemcpyRectTy>(SrcRect);
+ uint32_t AsPISize[3] = {Size.x, Size.y, Size.z};
+
+ if (DstDevice == Host)
+ return SrcDevice->Device->dataRetrieveRect(AsPIDst, AsPISrc, AsPISize,
+ QueueImpl);
+
+ if (SrcDevice == Host)
+ return DstDevice->Device->dataSubmitRect(AsPIDst, AsPISrc, AsPISize,
+ QueueImpl);
+
+ return DstDevice->Device->dataExchangeRect(AsPISrc, *DstDevice->Device,
+ AsPIDst, AsPISize, QueueImpl);
+}
+
Error olMemFill_impl(ol_queue_handle_t Queue, void *Ptr, size_t PatternSize,
const void *PatternPtr, size_t FillSize) {
return Queue->Device->Device->dataFill(Ptr, PatternPtr, PatternSize, FillSize,
diff --git a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.cpp b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.cpp
index bc92f4a46a5c0..471aee954b7ab 100644
--- a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.cpp
+++ b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.cpp
@@ -59,6 +59,7 @@ DLWRAP(hsa_amd_agent_iterate_memory_pools, 3)
DLWRAP(hsa_amd_memory_pool_allocate, 4)
DLWRAP(hsa_amd_memory_pool_free, 1)
DLWRAP(hsa_amd_memory_async_copy, 8)
+DLWRAP(hsa_amd_memory_async_copy_rect, 10)
DLWRAP(hsa_amd_memory_pool_get_info, 3)
DLWRAP(hsa_amd_agents_allow_access, 4)
DLWRAP(hsa_amd_memory_lock, 5)
diff --git a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
index 29cfe78082dbb..71bc8512f2f41 100644
--- a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
+++ b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
@@ -96,6 +96,26 @@ hsa_status_t hsa_amd_memory_async_copy(void *dst, hsa_agent_t dst_agent,
const hsa_signal_t *dep_signals,
hsa_signal_t completion_signal);
+enum hsa_amd_copy_direction_t {
+ hsaHostToHost = 0,
+ hsaHostToDevice = 1,
+ hsaDeviceToHost = 2,
+ hsaDeviceToDevice = 3,
+};
+
+typedef struct hsa_pitched_ptr_s {
+ void *base;
+ size_t pitch;
+ size_t slice;
+} hsa_pitched_ptr_t;
+
+hsa_status_t hsa_amd_memory_async_copy_rect(
+ const hsa_pitched_ptr_t *dst, const hsa_dim3_t *dst_offset,
+ const hsa_pitched_ptr_t *src, const hsa_dim3_t *src_offset,
+ const hsa_dim3_t *range, hsa_agent_t copy_agent,
+ hsa_amd_copy_direction_t dir, uint32_t num_dep_signals,
+ const hsa_signal_t *dep_signals, hsa_signal_t completion_signal);
+
hsa_status_t hsa_amd_agent_memory_pool_get_info(
hsa_agent_t agent, hsa_amd_memory_pool_t memory_pool,
hsa_amd_agent_memory_pool_info_t attribute, void *value);
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 64470e9fabf46..44a860a73e4f3 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -197,6 +197,23 @@ static Error asyncMemCopy(bool UseMultipleSdmaEngines, void *Dst,
#endif
}
+/// Dispatches an asynchronous 3D/2D memory copy.
+static Error asyncMemCopyRect(MemcpyRectTy Dst, MemcpyRectTy Src,
+ hsa_agent_t Agent, hsa_amd_copy_direction_t Dir,
+ uint32_t Size[3], uint32_t NumDepSignals,
+ const hsa_signal_t *DepSignals,
+ hsa_signal_t CompletionSignal) {
+ hsa_pitched_ptr_t SrcPitched{Src.Base, Src.Pitch, Src.Slice};
+ hsa_pitched_ptr_t DstPitched{Dst.Base, Dst.Pitch, Dst.Slice};
+
+ hsa_status_t S = hsa_amd_memory_async_copy_rect(
+ &DstPitched, reinterpret_cast<hsa_dim3_t *>(Dst.Offset), &SrcPitched,
+ reinterpret_cast<hsa_dim3_t *>(Src.Offset),
+ reinterpret_cast<hsa_dim3_t *>(Size), Agent, Dir, NumDepSignals,
+ DepSignals, CompletionSignal);
+ return Plugin::check(S, "error in hsa_amd_memory_async_copy_rect: %s");
+}
+
static Error getTargetTripleAndFeatures(hsa_agent_t Agent,
SmallVector<SmallString<32>> &Targets) {
auto Err = hsa_utils::iterateAgentISAs(Agent, [&](hsa_isa_t ISA) {
@@ -1365,6 +1382,33 @@ struct AMDGPUStreamTy {
OutputSignal->get());
}
+ /// Push an asynchronous 2D or 3D memory copy between pinned memory buffers.
+ Error pushPinnedMemoryCopyRectAsync(MemcpyRectTy Dst, MemcpyRectTy Src,
+ uint32_t CopySize[3],
+ hsa_amd_copy_direction_t Dir) {
+ // Retrieve an available signal for the operation's output.
+ AMDGPUSignalTy *OutputSignal = nullptr;
+ if (auto Err = SignalManager.getResource(OutputSignal))
+ return Err;
+ OutputSignal->reset();
+ OutputSignal->increaseUseCount();
+
+ std::lock_guard<std::mutex> Lock(Mutex);
+
+ // Consume stream slot and compute dependencies.
+ auto [Curr, InputSignal] = consume(OutputSignal);
+
+ // Issue the async memory copy.
+ if (InputSignal && InputSignal->load()) {
+ hsa_signal_t InputSignalRaw = InputSignal->get();
+ return hsa_utils::asyncMemCopyRect(Dst, Src, Agent, Dir, CopySize, 1,
+ &InputSignalRaw, OutputSignal->get());
+ }
+
+ return hsa_utils::asyncMemCopyRect(Dst, Src, Agent, Dir, CopySize, 0,
+ nullptr, OutputSignal->get());
+ }
+
/// Push an asynchronous memory copy device-to-host involving an unpinned
/// memory buffer. The operation consists of a two-step copy from the
/// device buffer to an intermediate pinned host buffer, and then, to a
@@ -1539,6 +1583,37 @@ struct AMDGPUStreamTy {
OutputSignal->get());
}
+ Error pushMemoryCopyD2DRectAsync(MemcpyRectTy Dst, MemcpyRectTy Src,
+ hsa_agent_t Agent, uint32_t CopySize[3]) {
+ AMDGPUSignalTy *OutputSignal;
+ if (auto Err = SignalManager.getResources(/*Num=*/1, &OutputSignal))
+ return Err;
+ OutputSignal->reset();
+ OutputSignal->increaseUseCount();
+
+ std::lock_guard<std::mutex> Lock(Mutex);
+
+ // Consume stream slot and compute dependencies.
+ auto [Curr, InputSignal] = consume(OutputSignal);
+
+ // The agents need to have access to the corresponding memory
+ // This is presently only true if the pointers were originally
+ // allocated by this runtime or the caller made the appropriate
+ // access calls.
+
+ // TODO: Cross device transfers might not work
+
+ if (InputSignal && InputSignal->load()) {
+ hsa_signal_t InputSignalRaw = InputSignal->get();
+ return hsa_utils::asyncMemCopyRect(Dst, Src, Agent, hsaDeviceToDevice,
+ CopySize, 1, &InputSignalRaw,
+ OutputSignal->get());
+ }
+ return hsa_utils::asyncMemCopyRect(Dst, Src, Agent, hsaDeviceToDevice,
+ CopySize, 0, nullptr,
+ OutputSignal->get());
+ }
+
Error pushHostCallback(void (*Callback)(void *), void *UserData) {
// Retrieve an available signal for the operation's output.
AMDGPUSignalTy *OutputSignal = nullptr;
@@ -2523,6 +2598,28 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
PinnedMemoryManager);
}
+ /// 2D/3D host to device transfer.
+ Error dataSubmitRectImpl(MemcpyRectTy TgtRect, MemcpyRectTy HstRect,
+ uint32_t Size[3],
+ AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+ AMDGPUStreamTy *Stream = nullptr;
+
+ // Use one-step asynchronous operation when host memory is already pinned.
+ if (void *PinnedPtr =
+ PinnedAllocs.getDeviceAccessiblePtrFromPinnedBuffer(HstRect.Base)) {
+ if (auto Err = getStream(AsyncInfoWrapper, Stream))
+ return Err;
+
+ HstRect.Base = PinnedPtr;
+ return Stream->pushPinnedMemoryCopyRectAsync(TgtRect, HstRect, Size,
+ hsaHostToDevice);
+ }
+
+ return Plugin::error(ErrorCode::INVALID_VALUE,
+ "AMDGPU doesn't support 2D/3D copies involving memory "
+ "not allocated with `olMemAlloc`");
+ }
+
/// Retrieve data from the device (device to host transfer).
Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size,
AsyncInfoWrapperTy &AsyncInfoWrapper) override {
@@ -2583,6 +2680,27 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
PinnedMemoryManager);
}
+ /// 2D/3D device to host transfer.
+ Error dataRetrieveRectImpl(MemcpyRectTy HstRect, MemcpyRectTy TgtRect,
+ uint32_t Size[3],
+ AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+ AMDGPUStreamTy *Stream = nullptr;
+
+ if (void *PinnedPtr =
+ PinnedAllocs.getDeviceAccessiblePtrFromPinnedBuffer(HstRect.Base)) {
+ if (auto Err = getStream(AsyncInfoWrapper, Stream))
+ return Err;
+
+ HstRect.Base = PinnedPtr;
+ return Stream->pushPinnedMemoryCopyRectAsync(HstRect, TgtRect, Size,
+ hsaDeviceToHost);
+ }
+
+ return Plugin::error(ErrorCode::INVALID_VALUE,
+ "AMDGPU doesn't support 2D/3D copies involving memory "
+ "not allocated with `olMemAlloc`");
+ }
+
/// Exchange data between two devices within the plugin.
Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstGenericDevice,
void *DstPtr, int64_t Size,
@@ -2620,6 +2738,20 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
getAgent(), (uint64_t)Size);
}
+ /// 2D/3D device to device transfer.
+ Error dataExchangeRectImpl(MemcpyRectTy SrcRect,
+ GenericDeviceTy &DstGenericDevice,
+ MemcpyRectTy DstRect, uint32_t Size[3],
+ AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+ AMDGPUDeviceTy &DstDevice = static_cast<AMDGPUDeviceTy &>(DstGenericDevice);
+ AMDGPUStreamTy *Stream = nullptr;
+
+ if (auto Err = getStream(AsyncInfoWrapper, Stream))
+ return Err;
+ return Stream->pushMemoryCopyD2DRectAsync(DstRect, SrcRect,
+ DstDevice.getAgent(), Size);
+ }
+
/// Insert a data fence between previous data operations and the following
/// operations. This is a no-op for AMDGPU devices as operations inserted into
/// a queue are in-order.
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 5620437716b31..9efd6639339e0 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -921,12 +921,25 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
virtual Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size,
AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
+ Error dataSubmitRect(MemcpyRectTy TgtRect, const MemcpyRectTy HstRect,
+ uint32_t Size[3], __tgt_async_info *AsyncInfo);
+ virtual Error dataSubmitRectImpl(const MemcpyRectTy TgtRect,
+ MemcpyRectTy HstRect, uint32_t Size[3],
+ AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
+
/// Retrieve data from the device (device to host transfer).
Error dataRetrieve(void *HstPtr, const void *TgtPtr, int64_t Size,
__tgt_async_info *AsyncInfo);
virtual Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size,
AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
+ Error dataRetrieveRect(MemcpyRectTy HstRect, const MemcpyRectTy TgtRect,
+ uint32_t Size[3], __tgt_async_info *AsyncInfo);
+ virtual Error dataRetrieveRectImpl(MemcpyRectTy HstRect,
+ const MemcpyRectTy TgtRect,
+ uint32_t Size[3],
+ AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
+
/// Instert a data fence between previous data operations and the following
/// operations if necessary for the device
virtual Error dataFence(__tgt_async_info *AsyncInfo) = 0;
@@ -940,6 +953,15 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
void *DstPtr, int64_t Size,
AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
+ Error dataExchangeRect(MemcpyRectTy SrcRect, GenericDeviceTy &DstDev,
+ const MemcpyRectTy DstRect, uint32_t Size[3],
+ __tgt_async_info *AsyncInfo);
+ virtual Error dataExchangeRectImpl(MemcpyRectTy SrcRect,
+ GenericDeviceTy &DstDev,
+ const MemcpyRectTy DstRect,
+ uint32_t Size[3],
+ AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
+
/// Fill data on the device with a pattern from the host
Error dataFill(void *TgtPtr, const void *PatternPtr, int64_t PatternSize,
int64_t Size, __tgt_async_info *AsyncInfo);
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 30b5db782370d..78823674eb146 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -1423,6 +1423,17 @@ Error GenericDeviceTy::dataSubmit(void *TgtPtr, const void *HstPtr,
return Err;
}
+Error GenericDeviceTy::dataSubmitRect(MemcpyRectTy TgtRect,
+ const MemcpyRectTy HstRect,
+ uint32_t Size[3],
+ __tgt_async_info *AsyncInfo) {
+ AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);
+
+ auto Err = dataSubmitRectImpl(TgtRect, HstRect, Size, AsyncInfoWrapper);
+ AsyncInfoWrapper.finalize(Err);
+ return Err;
+}
+
Error GenericDeviceTy::dataRetrieve(void *HstPtr, const void *TgtPtr,
int64_t Size, __tgt_async_info *AsyncInfo) {
AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);
@@ -1432,6 +1443,17 @@ Error GenericDeviceTy::dataRetrieve(void *HstPtr, const void *TgtPtr,
return Err;
}
+Error GenericDeviceTy::dataRetrieveRect(MemcpyRectTy HstRect,
+ const MemcpyRectTy TgtRect,
+ uint32_t Size[3],
+ __tgt_async_info *AsyncInfo) {
+ AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);
+
+ auto E...
[truncated]
``````````
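
For readers unfamiliar with pitched copies, the convention used throughout the patch maps a byte coordinate (x, y, z) to `base + x + y * pitch + z * slice`. The sketch below is illustrative only (the helper names are made up and are not liboffload or HSA API); it spells out the host-side semantics that the new entry points delegate to `hsa_amd_memory_async_copy_rect`:

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>

// Illustrative only: resolve a byte coordinate inside a pitched/sliced buffer.
// `Pitch` is bytes per row and `Slice` is bytes per plane, matching the
// ol_memcpy_rect_t / hsa_pitched_ptr_t convention above.
static inline char *rectAddr(char *Base, uint32_t X, uint32_t Y, uint32_t Z,
                             size_t Pitch, size_t Slice) {
  return Base + X + static_cast<size_t>(Y) * Pitch +
         static_cast<size_t>(Z) * Slice;
}

// Reference semantics of the rect copy: one contiguous run of Size[0] bytes
// per (y, z) coordinate. The real implementation issues a single asynchronous
// hsa_amd_memory_async_copy_rect call instead of looping on the host.
static void rectCopyReference(char *Dst, const uint32_t DstOff[3],
                              size_t DstPitch, size_t DstSlice, char *Src,
                              const uint32_t SrcOff[3], size_t SrcPitch,
                              size_t SrcSlice, const uint32_t Size[3]) {
  for (uint32_t Z = 0; Z < Size[2]; ++Z)
    for (uint32_t Y = 0; Y < Size[1]; ++Y)
      std::memcpy(rectAddr(Dst, DstOff[0], DstOff[1] + Y, DstOff[2] + Z,
                           DstPitch, DstSlice),
                  rectAddr(Src, SrcOff[0], SrcOff[1] + Y, SrcOff[2] + Z,
                           SrcPitch, SrcSlice),
                  Size[0]);
}
```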
</details>
https://github.com/llvm/llvm-project/pull/160321
More information about the llvm-commits mailing list