[llvm] [Offload] Add `olMemcpyRect` with amdgpu implementation (PR #160321)

via llvm-commits llvm-commits at lists.llvm.org
Wed Sep 24 02:57:17 PDT 2025


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-amdgpu

Author: Ross Brunton (RossBrunton)

<details>
<summary>Changes</summary>

This is a memcpy function that copies a 2D or 3D range rather than a
linear byte count. This is an early version, and has several
limitations (some of which are temporary, some are permanent):
* Only amdgpu is supported (CUDA returns `UNSUPPORTED`).
* The queue is required.
* The pitch, slice and base address must be aligned to 4 bytes.
* All buffers must have been allocated through `olMemAlloc` (i.e., for
  the amd runtime they must be "pinned").
* Host-to-host copies are not supported.


---

Patch is 36.10 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/160321.diff


12 Files Affected:

- (modified) offload/include/Shared/APITypes.h (+9) 
- (modified) offload/liboffload/API/Memory.td (+54-1) 
- (modified) offload/liboffload/src/OffloadImpl.cpp (+30) 
- (modified) offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.cpp (+1) 
- (modified) offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h (+20) 
- (modified) offload/plugins-nextgen/amdgpu/src/rtl.cpp (+132) 
- (modified) offload/plugins-nextgen/common/include/PluginInterface.h (+22) 
- (modified) offload/plugins-nextgen/common/src/PluginInterface.cpp (+35) 
- (modified) offload/plugins-nextgen/cuda/src/rtl.cpp (+21) 
- (modified) offload/plugins-nextgen/host/src/rtl.cpp (+21) 
- (modified) offload/unittests/OffloadAPI/CMakeLists.txt (+2-1) 
- (added) offload/unittests/OffloadAPI/memory/olMemcpyRect.cpp (+358) 


``````````diff
diff --git a/offload/include/Shared/APITypes.h b/offload/include/Shared/APITypes.h
index 8c150b6bfc2d4..e2387900740a5 100644
--- a/offload/include/Shared/APITypes.h
+++ b/offload/include/Shared/APITypes.h
@@ -126,6 +126,15 @@ struct KernelLaunchParamsTy {
   /// Ptrs to the Data entries. Only strictly required for the host plugin.
   void **Ptrs = nullptr;
 };
+
+/// Rectangular range for rect memcpies. Should be the same layout as
+/// liboffload's `ol_memcpy_rect_t`.
+struct MemcpyRectTy {
+  void *Base;
+  uint32_t Offset[3];
+  size_t Pitch;
+  size_t Slice;
+};
 }
 
 #endif // OMPTARGET_SHARED_API_TYPES_H
diff --git a/offload/liboffload/API/Memory.td b/offload/liboffload/API/Memory.td
index debda165d2b23..4bdaa1aa73a26 100644
--- a/offload/liboffload/API/Memory.td
+++ b/offload/liboffload/API/Memory.td
@@ -22,7 +22,8 @@ def ol_alloc_type_t : Enum {
 def olMemAlloc : Function {
   let desc = "Creates a memory allocation on the specified device.";
   let details = [
-      "All allocations through olMemAlloc regardless of source share a single virtual address range. There is no risk of multiple devices returning equal pointers to different memory."
+      "All allocations through olMemAlloc regardless of source share a single virtual address range. There is no risk of multiple devices returning equal pointers to different memory.",
+      "The returned memory allocation will be aligned at least to a 4 byte boundary.",
   ];
   let params = [
     Param<"ol_device_handle_t", "Device", "handle of the device to allocate on", PARAM_IN>,
@@ -63,6 +64,58 @@ def olMemcpy : Function {
     let returns = [];
 }
 
+def ol_memcpy_rect_t : Struct {
+  let desc = "A 3D view into a buffer for `olMemcpyRect`";
+  let members = [
+    StructMember<"void*", "buffer", "the buffer backing this range">,
+    StructMember<"ol_dimensions_t", "offset", "byte coordinate offset into the space">,
+    StructMember<"size_t", "pitch", "the pitch of the buffer in bytes (i.e. how large each `x` row is)">,
+    StructMember<"size_t", "slice", "the slice of the buffer in bytes (i.e. how large each `pitch * y` plane is)">,
+  ];
+}
+
+def olMemcpyRect : Function {
+    let desc = "Enqueue a 2D or 3D memcpy operation.";
+    let details = [
+        "For host pointers, use the host device belonging to the OL_PLATFORM_BACKEND_HOST platform.",
+        "If a queue is specified, at least one device must be a non-host device",
+        "For both the source and destination, the base pointer, pitch and slice must all be aligned to 4 bytes",
+        "For 2D copies (where `Size.z` is 1), the slice value is ignored",
+        "Both the source and destination must have been allocated via `olMemAlloc`",
+        "Either the source or destination (or both) must have a non-host device",
+        "If a queue is not specified, the memcpy happens synchronously",
+    ];
+    let params = [
+        Param<"ol_queue_handle_t", "Queue", "handle of the queue.", PARAM_IN_OPTIONAL>,
+        Param<"ol_memcpy_rect_t", "DstRect", "pointer to copy to", PARAM_IN>,
+        Param<"ol_device_handle_t", "DstDevice", "device that DstPtr belongs to", PARAM_IN>,
+        Param<"ol_memcpy_rect_t", "SrcRect", "pointer to copy from", PARAM_IN>,
+        Param<"ol_device_handle_t", "SrcDevice", "device that SrcPtr belongs to", PARAM_IN>,
+        Param<"ol_dimensions_t", "Size", "size in bytes of data to copy", PARAM_IN>,
+    ];
+    let returns = [
+      Return<"OL_ERRC_INVALID_SIZE", [
+        "`DstRect.pitch % 4 > 0`",
+        "`DstRect.slice % 4 > 0`",
+        "`(uintptr_t)DstRect.buffer % 4 > 0`",
+        "`SrcRect.pitch % 4 > 0`",
+        "`SrcRect.slice % 4 > 0`",
+        "`(uintptr_t)SrcRect.buffer % 4 > 0`",
+        "`Size.x == 0 || Size.y == 0 || Size.z == 0`",
+      ]>,
+      Return<"OL_ERRC_INVALID_NULL_POINTER", [
+        "`DstRect.buffer == NULL`",
+        "`SrcRect.buffer == NULL`",
+      ]>,
+      Return<"OL_ERRC_INVALID_VALUE", [
+        "Either the source or destination was not allocated via `olMemAlloc`",
+      ]>,
+      Return<"OL_ERRC_INVALID_ARGUMENT", [
+        "Both arguments are the host device",
+      ]>
+    ];
+}
+
 def olMemFill : Function {
   let desc = "Fill memory with copies of the given pattern";
   let details = [
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index 5457fc50b9711..327cc8ebd9223 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -927,6 +927,36 @@ Error olMemcpy_impl(ol_queue_handle_t Queue, void *DstPtr,
   return Error::success();
 }
 
+Error olMemcpyRect_impl(ol_queue_handle_t Queue, ol_memcpy_rect_t DstRect,
+                        ol_device_handle_t DstDevice, ol_memcpy_rect_t SrcRect,
+                        ol_device_handle_t SrcDevice, ol_dimensions_t Size) {
+  auto Host = OffloadContext::get().HostDevice();
+  if (DstDevice == Host && SrcDevice == Host) {
+    return createOffloadError(
+        ErrorCode::INVALID_ARGUMENT,
+        "one of DstDevice and SrcDevice must be a non-host device");
+  }
+
+  // If no queue is given the memcpy will be synchronous
+  auto QueueImpl = Queue ? Queue->AsyncInfo : nullptr;
+
+  static_assert(sizeof(ol_memcpy_rect_t) == sizeof(MemcpyRectTy));
+  auto AsPIDst = bit_cast<MemcpyRectTy>(DstRect);
+  auto AsPISrc = bit_cast<MemcpyRectTy>(SrcRect);
+  uint32_t AsPISize[3] = {Size.x, Size.y, Size.z};
+
+  if (DstDevice == Host)
+    return SrcDevice->Device->dataRetrieveRect(AsPIDst, AsPISrc, AsPISize,
+                                               QueueImpl);
+
+  if (SrcDevice == Host)
+    return DstDevice->Device->dataSubmitRect(AsPIDst, AsPISrc, AsPISize,
+                                             QueueImpl);
+
+  return DstDevice->Device->dataExchangeRect(AsPISrc, *DstDevice->Device,
+                                             AsPIDst, AsPISize, QueueImpl);
+}
+
 Error olMemFill_impl(ol_queue_handle_t Queue, void *Ptr, size_t PatternSize,
                      const void *PatternPtr, size_t FillSize) {
   return Queue->Device->Device->dataFill(Ptr, PatternPtr, PatternSize, FillSize,
diff --git a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.cpp b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.cpp
index bc92f4a46a5c0..471aee954b7ab 100644
--- a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.cpp
+++ b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.cpp
@@ -59,6 +59,7 @@ DLWRAP(hsa_amd_agent_iterate_memory_pools, 3)
 DLWRAP(hsa_amd_memory_pool_allocate, 4)
 DLWRAP(hsa_amd_memory_pool_free, 1)
 DLWRAP(hsa_amd_memory_async_copy, 8)
+DLWRAP(hsa_amd_memory_async_copy_rect, 10)
 DLWRAP(hsa_amd_memory_pool_get_info, 3)
 DLWRAP(hsa_amd_agents_allow_access, 4)
 DLWRAP(hsa_amd_memory_lock, 5)
diff --git a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
index 29cfe78082dbb..71bc8512f2f41 100644
--- a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
+++ b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
@@ -96,6 +96,26 @@ hsa_status_t hsa_amd_memory_async_copy(void *dst, hsa_agent_t dst_agent,
                                        const hsa_signal_t *dep_signals,
                                        hsa_signal_t completion_signal);
 
+enum hsa_amd_copy_direction_t {
+  hsaHostToHost = 0,
+  hsaHostToDevice = 1,
+  hsaDeviceToHost = 2,
+  hsaDeviceToDevice = 3,
+};
+
+typedef struct hsa_pitched_ptr_s {
+  void *base;
+  size_t pitch;
+  size_t slice;
+} hsa_pitched_ptr_t;
+
+hsa_status_t hsa_amd_memory_async_copy_rect(
+    const hsa_pitched_ptr_t *dst, const hsa_dim3_t *dst_offset,
+    const hsa_pitched_ptr_t *src, const hsa_dim3_t *src_offset,
+    const hsa_dim3_t *range, hsa_agent_t copy_agent,
+    hsa_amd_copy_direction_t dir, uint32_t num_dep_signals,
+    const hsa_signal_t *dep_signals, hsa_signal_t completion_signal);
+
 hsa_status_t hsa_amd_agent_memory_pool_get_info(
     hsa_agent_t agent, hsa_amd_memory_pool_t memory_pool,
     hsa_amd_agent_memory_pool_info_t attribute, void *value);
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 64470e9fabf46..44a860a73e4f3 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -197,6 +197,23 @@ static Error asyncMemCopy(bool UseMultipleSdmaEngines, void *Dst,
 #endif
 }
 
+/// Dispatches an asynchronous 3D/2D memory copy.
+static Error asyncMemCopyRect(MemcpyRectTy Dst, MemcpyRectTy Src,
+                              hsa_agent_t Agent, hsa_amd_copy_direction_t Dir,
+                              uint32_t Size[3], uint32_t NumDepSignals,
+                              const hsa_signal_t *DepSignals,
+                              hsa_signal_t CompletionSignal) {
+  hsa_pitched_ptr_t SrcPitched{Src.Base, Src.Pitch, Src.Slice};
+  hsa_pitched_ptr_t DstPitched{Dst.Base, Dst.Pitch, Dst.Slice};
+
+  hsa_status_t S = hsa_amd_memory_async_copy_rect(
+      &DstPitched, reinterpret_cast<hsa_dim3_t *>(Dst.Offset), &SrcPitched,
+      reinterpret_cast<hsa_dim3_t *>(Src.Offset),
+      reinterpret_cast<hsa_dim3_t *>(Size), Agent, Dir, NumDepSignals,
+      DepSignals, CompletionSignal);
+  return Plugin::check(S, "error in hsa_amd_memory_async_copy_rect: %s");
+}
+
 static Error getTargetTripleAndFeatures(hsa_agent_t Agent,
                                         SmallVector<SmallString<32>> &Targets) {
   auto Err = hsa_utils::iterateAgentISAs(Agent, [&](hsa_isa_t ISA) {
@@ -1365,6 +1382,33 @@ struct AMDGPUStreamTy {
                                    OutputSignal->get());
   }
 
+  /// Push an asynchronous 2D or 3D memory copy between pinned memory buffers.
+  Error pushPinnedMemoryCopyRectAsync(MemcpyRectTy Dst, MemcpyRectTy Src,
+                                      uint32_t CopySize[3],
+                                      hsa_amd_copy_direction_t Dir) {
+    // Retrieve an available signal for the operation's output.
+    AMDGPUSignalTy *OutputSignal = nullptr;
+    if (auto Err = SignalManager.getResource(OutputSignal))
+      return Err;
+    OutputSignal->reset();
+    OutputSignal->increaseUseCount();
+
+    std::lock_guard<std::mutex> Lock(Mutex);
+
+    // Consume stream slot and compute dependencies.
+    auto [Curr, InputSignal] = consume(OutputSignal);
+
+    // Issue the async memory copy.
+    if (InputSignal && InputSignal->load()) {
+      hsa_signal_t InputSignalRaw = InputSignal->get();
+      return hsa_utils::asyncMemCopyRect(Dst, Src, Agent, Dir, CopySize, 1,
+                                         &InputSignalRaw, OutputSignal->get());
+    }
+
+    return hsa_utils::asyncMemCopyRect(Dst, Src, Agent, Dir, CopySize, 0,
+                                       nullptr, OutputSignal->get());
+  }
+
   /// Push an asynchronous memory copy device-to-host involving an unpinned
   /// memory buffer. The operation consists of a two-step copy from the
   /// device buffer to an intermediate pinned host buffer, and then, to a
@@ -1539,6 +1583,37 @@ struct AMDGPUStreamTy {
                                    OutputSignal->get());
   }
 
+  Error pushMemoryCopyD2DRectAsync(MemcpyRectTy Dst, MemcpyRectTy Src,
+                                   hsa_agent_t Agent, uint32_t CopySize[3]) {
+    AMDGPUSignalTy *OutputSignal;
+    if (auto Err = SignalManager.getResources(/*Num=*/1, &OutputSignal))
+      return Err;
+    OutputSignal->reset();
+    OutputSignal->increaseUseCount();
+
+    std::lock_guard<std::mutex> Lock(Mutex);
+
+    // Consume stream slot and compute dependencies.
+    auto [Curr, InputSignal] = consume(OutputSignal);
+
+    // The agents need to have access to the corresponding memory
+    // This is presently only true if the pointers were originally
+    // allocated by this runtime or the caller made the appropriate
+    // access calls.
+
+    // TODO: Cross device transfers might not work
+
+    if (InputSignal && InputSignal->load()) {
+      hsa_signal_t InputSignalRaw = InputSignal->get();
+      return hsa_utils::asyncMemCopyRect(Dst, Src, Agent, hsaDeviceToDevice,
+                                         CopySize, 1, &InputSignalRaw,
+                                         OutputSignal->get());
+    }
+    return hsa_utils::asyncMemCopyRect(Dst, Src, Agent, hsaDeviceToDevice,
+                                       CopySize, 0, nullptr,
+                                       OutputSignal->get());
+  }
+
   Error pushHostCallback(void (*Callback)(void *), void *UserData) {
     // Retrieve an available signal for the operation's output.
     AMDGPUSignalTy *OutputSignal = nullptr;
@@ -2523,6 +2598,28 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
                                           PinnedMemoryManager);
   }
 
+  /// 2D/3D host to device transfer.
+  Error dataSubmitRectImpl(MemcpyRectTy TgtRect, MemcpyRectTy HstRect,
+                           uint32_t Size[3],
+                           AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+    AMDGPUStreamTy *Stream = nullptr;
+
+    // Use one-step asynchronous operation when host memory is already pinned.
+    if (void *PinnedPtr =
+            PinnedAllocs.getDeviceAccessiblePtrFromPinnedBuffer(HstRect.Base)) {
+      if (auto Err = getStream(AsyncInfoWrapper, Stream))
+        return Err;
+
+      HstRect.Base = PinnedPtr;
+      return Stream->pushPinnedMemoryCopyRectAsync(TgtRect, HstRect, Size,
+                                                   hsaHostToDevice);
+    }
+
+    return Plugin::error(ErrorCode::INVALID_VALUE,
+                         "AMDGPU doesn't support 2D/3D copies involving memory "
+                         "not allocated with `olMemAlloc`");
+  }
+
   /// Retrieve data from the device (device to host transfer).
   Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size,
                          AsyncInfoWrapperTy &AsyncInfoWrapper) override {
@@ -2583,6 +2680,27 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
                                           PinnedMemoryManager);
   }
 
+  /// 2D/3D device to host transfer.
+  Error dataRetrieveRectImpl(MemcpyRectTy HstRect, MemcpyRectTy TgtRect,
+                             uint32_t Size[3],
+                             AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+    AMDGPUStreamTy *Stream = nullptr;
+
+    if (void *PinnedPtr =
+            PinnedAllocs.getDeviceAccessiblePtrFromPinnedBuffer(HstRect.Base)) {
+      if (auto Err = getStream(AsyncInfoWrapper, Stream))
+        return Err;
+
+      HstRect.Base = PinnedPtr;
+      return Stream->pushPinnedMemoryCopyRectAsync(HstRect, TgtRect, Size,
+                                                   hsaDeviceToHost);
+    }
+
+    return Plugin::error(ErrorCode::INVALID_VALUE,
+                         "AMDGPU doesn't support 2D/3D copies involving memory "
+                         "not allocated with `olMemAlloc`");
+  }
+
   /// Exchange data between two devices within the plugin.
   Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstGenericDevice,
                          void *DstPtr, int64_t Size,
@@ -2620,6 +2738,20 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
                                           getAgent(), (uint64_t)Size);
   }
 
+  /// 2D/3D device to device transfer.
+  Error dataExchangeRectImpl(MemcpyRectTy SrcRect,
+                             GenericDeviceTy &DstGenericDevice,
+                             MemcpyRectTy DstRect, uint32_t Size[3],
+                             AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+    AMDGPUDeviceTy &DstDevice = static_cast<AMDGPUDeviceTy &>(DstGenericDevice);
+    AMDGPUStreamTy *Stream = nullptr;
+
+    if (auto Err = getStream(AsyncInfoWrapper, Stream))
+      return Err;
+    return Stream->pushMemoryCopyD2DRectAsync(DstRect, SrcRect,
+                                              DstDevice.getAgent(), Size);
+  }
+
   /// Insert a data fence between previous data operations and the following
   /// operations. This is a no-op for AMDGPU devices as operations inserted into
   /// a queue are in-order.
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 5620437716b31..9efd6639339e0 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -921,12 +921,25 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   virtual Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size,
                                AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
 
+  Error dataSubmitRect(MemcpyRectTy TgtRect, const MemcpyRectTy HstRect,
+                       uint32_t Size[3], __tgt_async_info *AsyncInfo);
+  virtual Error dataSubmitRectImpl(const MemcpyRectTy TgtRect,
+                                   MemcpyRectTy HstRect, uint32_t Size[3],
+                                   AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
+
   /// Retrieve data from the device (device to host transfer).
   Error dataRetrieve(void *HstPtr, const void *TgtPtr, int64_t Size,
                      __tgt_async_info *AsyncInfo);
   virtual Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size,
                                  AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
 
+  Error dataRetrieveRect(MemcpyRectTy HstRect, const MemcpyRectTy TgtRect,
+                         uint32_t Size[3], __tgt_async_info *AsyncInfo);
+  virtual Error dataRetrieveRectImpl(MemcpyRectTy HstRect,
+                                     const MemcpyRectTy TgtRect,
+                                     uint32_t Size[3],
+                                     AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
+
   /// Instert a data fence between previous data operations and the following
   /// operations if necessary for the device
   virtual Error dataFence(__tgt_async_info *AsyncInfo) = 0;
@@ -940,6 +953,15 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
                                  void *DstPtr, int64_t Size,
                                  AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
 
+  Error dataExchangeRect(MemcpyRectTy SrcRect, GenericDeviceTy &DstDev,
+                         const MemcpyRectTy DstRect, uint32_t Size[3],
+                         __tgt_async_info *AsyncInfo);
+  virtual Error dataExchangeRectImpl(MemcpyRectTy SrcRect,
+                                     GenericDeviceTy &DstDev,
+                                     const MemcpyRectTy DstRect,
+                                     uint32_t Size[3],
+                                     AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
+
   /// Fill data on the device with a pattern from the host
   Error dataFill(void *TgtPtr, const void *PatternPtr, int64_t PatternSize,
                  int64_t Size, __tgt_async_info *AsyncInfo);
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 30b5db782370d..78823674eb146 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -1423,6 +1423,17 @@ Error GenericDeviceTy::dataSubmit(void *TgtPtr, const void *HstPtr,
   return Err;
 }
 
+Error GenericDeviceTy::dataSubmitRect(MemcpyRectTy TgtRect,
+                                      const MemcpyRectTy HstRect,
+                                      uint32_t Size[3],
+                                      __tgt_async_info *AsyncInfo) {
+  AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);
+
+  auto Err = dataSubmitRectImpl(TgtRect, HstRect, Size, AsyncInfoWrapper);
+  AsyncInfoWrapper.finalize(Err);
+  return Err;
+}
+
 Error GenericDeviceTy::dataRetrieve(void *HstPtr, const void *TgtPtr,
                                     int64_t Size, __tgt_async_info *AsyncInfo) {
   AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);
@@ -1432,6 +1443,17 @@ Error GenericDeviceTy::dataRetrieve(void *HstPtr, const void *TgtPtr,
   return Err;
 }
 
+Error GenericDeviceTy::dataRetrieveRect(MemcpyRectTy HstRect,
+                                        const MemcpyRectTy TgtRect,
+                                        uint32_t Size[3],
+                                        __tgt_async_info *AsyncInfo) {
+  AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);
+
+  auto E...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/160321


More information about the llvm-commits mailing list