[llvm] [Offload] Add `olMemcpyRect` with amdgpu implementation (PR #160321)

Ross Brunton via llvm-commits llvm-commits at lists.llvm.org
Wed Sep 24 02:54:36 PDT 2025


https://github.com/RossBrunton updated https://github.com/llvm/llvm-project/pull/160321

>From 9a6d6258830a8fe5470f24dc56f7e61be147ff0f Mon Sep 17 00:00:00 2001
From: Ross Brunton <bruntonross at protonmail.com>
Date: Tue, 23 Sep 2025 16:09:15 +0100
Subject: [PATCH] [Offload] Add `olMemcpyRect` with amdgpu implementation

This is a memcpy function that copies a 2D or 3D range rather than a
linear byte count. This is an early version, and has several
limitations (some of which are temporary, some are permanent):
* Only amdgpu is supported (CUDA returns `UNSUPPORTED`).
* The queue is required.
* The pitch, slice and base address must be aligned to 4 bytes.
* All buffers must have been allocated through `olMemAlloc` (i.e., for
  the amd runtime they must be "pinned").
* Host-to-host copies are not supported.
---
 offload/include/Shared/APITypes.h             |   9 +
 offload/liboffload/API/Memory.td              |  55 ++-
 offload/liboffload/src/OffloadImpl.cpp        |  30 ++
 .../amdgpu/dynamic_hsa/hsa.cpp                |   1 +
 .../amdgpu/dynamic_hsa/hsa_ext_amd.h          |  20 +
 offload/plugins-nextgen/amdgpu/src/rtl.cpp    | 132 +++++++
 .../common/include/PluginInterface.h          |  22 ++
 .../common/src/PluginInterface.cpp            |  35 ++
 offload/plugins-nextgen/cuda/src/rtl.cpp      |  21 +
 offload/plugins-nextgen/host/src/rtl.cpp      |  21 +
 offload/unittests/OffloadAPI/CMakeLists.txt   |   3 +-
 .../OffloadAPI/memory/olMemcpyRect.cpp        | 358 ++++++++++++++++++
 12 files changed, 705 insertions(+), 2 deletions(-)
 create mode 100644 offload/unittests/OffloadAPI/memory/olMemcpyRect.cpp

diff --git a/offload/include/Shared/APITypes.h b/offload/include/Shared/APITypes.h
index 8c150b6bfc2d4..e2387900740a5 100644
--- a/offload/include/Shared/APITypes.h
+++ b/offload/include/Shared/APITypes.h
@@ -126,6 +126,15 @@ struct KernelLaunchParamsTy {
   /// Ptrs to the Data entries. Only strictly required for the host plugin.
   void **Ptrs = nullptr;
 };
+
+/// Rectangular range for rect memcpies. Should be the same layout as
+/// liboffload's `ol_memcpy_rect_t`.
+struct MemcpyRectTy {
+  void *Base;
+  uint32_t Offset[3];
+  size_t Pitch;
+  size_t Slice;
+};
 }
 
 #endif // OMPTARGET_SHARED_API_TYPES_H
diff --git a/offload/liboffload/API/Memory.td b/offload/liboffload/API/Memory.td
index debda165d2b23..4bdaa1aa73a26 100644
--- a/offload/liboffload/API/Memory.td
+++ b/offload/liboffload/API/Memory.td
@@ -22,7 +22,8 @@ def ol_alloc_type_t : Enum {
 def olMemAlloc : Function {
   let desc = "Creates a memory allocation on the specified device.";
   let details = [
-      "All allocations through olMemAlloc regardless of source share a single virtual address range. There is no risk of multiple devices returning equal pointers to different memory."
+      "All allocations through olMemAlloc regardless of source share a single virtual address range. There is no risk of multiple devices returning equal pointers to different memory.",
+      "The returned memory allocation will be aligned at least to a 4 byte boundary.",
   ];
   let params = [
     Param<"ol_device_handle_t", "Device", "handle of the device to allocate on", PARAM_IN>,
@@ -63,6 +64,58 @@ def olMemcpy : Function {
     let returns = [];
 }
 
+def ol_memcpy_rect_t : Struct {
+  let desc = "A 3D view into a buffer for `olMemcpyRect`";
+  let members = [
+    StructMember<"void*", "buffer", "the buffer backing this range">,
+    StructMember<"ol_dimensions_t", "offset", "byte coordinate offset into the space">,
+    StructMember<"size_t", "pitch", "the pitch of the buffer in bytes (i.e. how large each `x` row is)">,
+    StructMember<"size_t", "slice", "the slice of the buffer in bytes (i.e. how large each `pitch * y` plane is)">,
+  ];
+}
+
+def olMemcpyRect : Function {
+    let desc = "Enqueue a 2D or 3D memcpy operation.";
+    let details = [
+        "For host pointers, use the host device belonging to the OL_PLATFORM_BACKEND_HOST platform.",
+        "If a queue is specified, at least one device must be a non-host device",
+        "For both the source and destination, the base pointer, pitch and slice must all be aligned to 4 bytes",
+        "For 2D copies (where `Size.z` is 1), the slice value is ignored",
+        "Both the source and destination must have been allocated via `olMemAlloc`",
+        "Either the source or destination (or both) must have a non-host device",
+        "If a queue is not specified, the memcpy happens synchronously",
+    ];
+    let params = [
+        Param<"ol_queue_handle_t", "Queue", "handle of the queue.", PARAM_IN_OPTIONAL>,
+        Param<"ol_memcpy_rect_t", "DstRect", "rectangular region to copy to", PARAM_IN>,
+        Param<"ol_device_handle_t", "DstDevice", "device that DstRect.buffer belongs to", PARAM_IN>,
+        Param<"ol_memcpy_rect_t", "SrcRect", "rectangular region to copy from", PARAM_IN>,
+        Param<"ol_device_handle_t", "SrcDevice", "device that SrcRect.buffer belongs to", PARAM_IN>,
+        Param<"ol_dimensions_t", "Size", "extent in bytes of the 3D region to copy", PARAM_IN>,
+    ];
+    let returns = [
+      Return<"OL_ERRC_INVALID_SIZE", [
+        "`DstRect.pitch % 4 > 0`",
+        "`DstRect.slice % 4 > 0`",
+        "`(uintptr_t)DstRect.buffer % 4 > 0`",
+        "`SrcRect.pitch % 4 > 0`",
+        "`SrcRect.slice % 4 > 0`",
+        "`(uintptr_t)SrcRect.buffer % 4 > 0`",
+        "`Size.x == 0 || Size.y == 0 || Size.z == 0`",
+      ]>,
+      Return<"OL_ERRC_INVALID_NULL_POINTER", [
+        "`DstRect.buffer == NULL`",
+        "`SrcRect.buffer == NULL`",
+      ]>,
+      Return<"OL_ERRC_INVALID_VALUE", [
+        "Either the source or destination was not allocated via `olMemAlloc`",
+      ]>,
+      Return<"OL_ERRC_INVALID_ARGUMENT", [
+        "Both DstDevice and SrcDevice are the host device",
+      ]>
+    ];
+}
+
 def olMemFill : Function {
   let desc = "Fill memory with copies of the given pattern";
   let details = [
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index 5457fc50b9711..327cc8ebd9223 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -927,6 +927,36 @@ Error olMemcpy_impl(ol_queue_handle_t Queue, void *DstPtr,
   return Error::success();
 }
 
+Error olMemcpyRect_impl(ol_queue_handle_t Queue, ol_memcpy_rect_t DstRect,
+                        ol_device_handle_t DstDevice, ol_memcpy_rect_t SrcRect,
+                        ol_device_handle_t SrcDevice, ol_dimensions_t Size) {
+  auto Host = OffloadContext::get().HostDevice();
+  if (DstDevice == Host && SrcDevice == Host) {
+    return createOffloadError(
+        ErrorCode::INVALID_ARGUMENT,
+        "one of DstDevice and SrcDevice must be a non-host device");
+  }
+
+  // If no queue is given the memcpy will be synchronous
+  auto QueueImpl = Queue ? Queue->AsyncInfo : nullptr;
+
+  static_assert(sizeof(ol_memcpy_rect_t) == sizeof(MemcpyRectTy));
+  auto AsPIDst = bit_cast<MemcpyRectTy>(DstRect);
+  auto AsPISrc = bit_cast<MemcpyRectTy>(SrcRect);
+  uint32_t AsPISize[3] = {Size.x, Size.y, Size.z};
+
+  if (DstDevice == Host)
+    return SrcDevice->Device->dataRetrieveRect(AsPIDst, AsPISrc, AsPISize,
+                                               QueueImpl);
+
+  if (SrcDevice == Host)
+    return DstDevice->Device->dataSubmitRect(AsPIDst, AsPISrc, AsPISize,
+                                             QueueImpl);
+
+  return DstDevice->Device->dataExchangeRect(AsPISrc, *DstDevice->Device,
+                                             AsPIDst, AsPISize, QueueImpl);
+}
+
 Error olMemFill_impl(ol_queue_handle_t Queue, void *Ptr, size_t PatternSize,
                      const void *PatternPtr, size_t FillSize) {
   return Queue->Device->Device->dataFill(Ptr, PatternPtr, PatternSize, FillSize,
diff --git a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.cpp b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.cpp
index bc92f4a46a5c0..471aee954b7ab 100644
--- a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.cpp
+++ b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.cpp
@@ -59,6 +59,7 @@ DLWRAP(hsa_amd_agent_iterate_memory_pools, 3)
 DLWRAP(hsa_amd_memory_pool_allocate, 4)
 DLWRAP(hsa_amd_memory_pool_free, 1)
 DLWRAP(hsa_amd_memory_async_copy, 8)
+DLWRAP(hsa_amd_memory_async_copy_rect, 10)
 DLWRAP(hsa_amd_memory_pool_get_info, 3)
 DLWRAP(hsa_amd_agents_allow_access, 4)
 DLWRAP(hsa_amd_memory_lock, 5)
diff --git a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
index 29cfe78082dbb..71bc8512f2f41 100644
--- a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
+++ b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
@@ -96,6 +96,26 @@ hsa_status_t hsa_amd_memory_async_copy(void *dst, hsa_agent_t dst_agent,
                                        const hsa_signal_t *dep_signals,
                                        hsa_signal_t completion_signal);
 
+enum hsa_amd_copy_direction_t {
+  hsaHostToHost = 0,
+  hsaHostToDevice = 1,
+  hsaDeviceToHost = 2,
+  hsaDeviceToDevice = 3,
+};
+
+typedef struct hsa_pitched_ptr_s {
+  void *base;
+  size_t pitch;
+  size_t slice;
+} hsa_pitched_ptr_t;
+
+hsa_status_t hsa_amd_memory_async_copy_rect(
+    const hsa_pitched_ptr_t *dst, const hsa_dim3_t *dst_offset,
+    const hsa_pitched_ptr_t *src, const hsa_dim3_t *src_offset,
+    const hsa_dim3_t *range, hsa_agent_t copy_agent,
+    hsa_amd_copy_direction_t dir, uint32_t num_dep_signals,
+    const hsa_signal_t *dep_signals, hsa_signal_t completion_signal);
+
 hsa_status_t hsa_amd_agent_memory_pool_get_info(
     hsa_agent_t agent, hsa_amd_memory_pool_t memory_pool,
     hsa_amd_agent_memory_pool_info_t attribute, void *value);
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 64470e9fabf46..44a860a73e4f3 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -197,6 +197,23 @@ static Error asyncMemCopy(bool UseMultipleSdmaEngines, void *Dst,
 #endif
 }
 
+/// Dispatches an asynchronous 3D/2D memory copy.
+static Error asyncMemCopyRect(MemcpyRectTy Dst, MemcpyRectTy Src,
+                              hsa_agent_t Agent, hsa_amd_copy_direction_t Dir,
+                              uint32_t Size[3], uint32_t NumDepSignals,
+                              const hsa_signal_t *DepSignals,
+                              hsa_signal_t CompletionSignal) {
+  hsa_pitched_ptr_t SrcPitched{Src.Base, Src.Pitch, Src.Slice};
+  hsa_pitched_ptr_t DstPitched{Dst.Base, Dst.Pitch, Dst.Slice};
+
+  hsa_status_t S = hsa_amd_memory_async_copy_rect(
+      &DstPitched, reinterpret_cast<hsa_dim3_t *>(Dst.Offset), &SrcPitched,
+      reinterpret_cast<hsa_dim3_t *>(Src.Offset),
+      reinterpret_cast<hsa_dim3_t *>(Size), Agent, Dir, NumDepSignals,
+      DepSignals, CompletionSignal);
+  return Plugin::check(S, "error in hsa_amd_memory_async_copy_rect: %s");
+}
+
 static Error getTargetTripleAndFeatures(hsa_agent_t Agent,
                                         SmallVector<SmallString<32>> &Targets) {
   auto Err = hsa_utils::iterateAgentISAs(Agent, [&](hsa_isa_t ISA) {
@@ -1365,6 +1382,33 @@ struct AMDGPUStreamTy {
                                    OutputSignal->get());
   }
 
+  /// Push an asynchronous 2D or 3D memory copy between pinned memory buffers.
+  Error pushPinnedMemoryCopyRectAsync(MemcpyRectTy Dst, MemcpyRectTy Src,
+                                      uint32_t CopySize[3],
+                                      hsa_amd_copy_direction_t Dir) {
+    // Retrieve an available signal for the operation's output.
+    AMDGPUSignalTy *OutputSignal = nullptr;
+    if (auto Err = SignalManager.getResource(OutputSignal))
+      return Err;
+    OutputSignal->reset();
+    OutputSignal->increaseUseCount();
+
+    std::lock_guard<std::mutex> Lock(Mutex);
+
+    // Consume stream slot and compute dependencies.
+    auto [Curr, InputSignal] = consume(OutputSignal);
+
+    // Issue the async memory copy.
+    if (InputSignal && InputSignal->load()) {
+      hsa_signal_t InputSignalRaw = InputSignal->get();
+      return hsa_utils::asyncMemCopyRect(Dst, Src, Agent, Dir, CopySize, 1,
+                                         &InputSignalRaw, OutputSignal->get());
+    }
+
+    return hsa_utils::asyncMemCopyRect(Dst, Src, Agent, Dir, CopySize, 0,
+                                       nullptr, OutputSignal->get());
+  }
+
   /// Push an asynchronous memory copy device-to-host involving an unpinned
   /// memory buffer. The operation consists of a two-step copy from the
   /// device buffer to an intermediate pinned host buffer, and then, to a
@@ -1539,6 +1583,37 @@ struct AMDGPUStreamTy {
                                    OutputSignal->get());
   }
 
+  Error pushMemoryCopyD2DRectAsync(MemcpyRectTy Dst, MemcpyRectTy Src,
+                                   hsa_agent_t Agent, uint32_t CopySize[3]) {
+    AMDGPUSignalTy *OutputSignal;
+    if (auto Err = SignalManager.getResources(/*Num=*/1, &OutputSignal))
+      return Err;
+    OutputSignal->reset();
+    OutputSignal->increaseUseCount();
+
+    std::lock_guard<std::mutex> Lock(Mutex);
+
+    // Consume stream slot and compute dependencies.
+    auto [Curr, InputSignal] = consume(OutputSignal);
+
+    // The agents need to have access to the corresponding memory
+    // This is presently only true if the pointers were originally
+    // allocated by this runtime or the caller made the appropriate
+    // access calls.
+
+    // TODO: Cross device transfers might not work
+
+    if (InputSignal && InputSignal->load()) {
+      hsa_signal_t InputSignalRaw = InputSignal->get();
+      return hsa_utils::asyncMemCopyRect(Dst, Src, Agent, hsaDeviceToDevice,
+                                         CopySize, 1, &InputSignalRaw,
+                                         OutputSignal->get());
+    }
+    return hsa_utils::asyncMemCopyRect(Dst, Src, Agent, hsaDeviceToDevice,
+                                       CopySize, 0, nullptr,
+                                       OutputSignal->get());
+  }
+
   Error pushHostCallback(void (*Callback)(void *), void *UserData) {
     // Retrieve an available signal for the operation's output.
     AMDGPUSignalTy *OutputSignal = nullptr;
@@ -2523,6 +2598,28 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
                                           PinnedMemoryManager);
   }
 
+  /// 2D/3D host to device transfer.
+  Error dataSubmitRectImpl(MemcpyRectTy TgtRect, MemcpyRectTy HstRect,
+                           uint32_t Size[3],
+                           AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+    AMDGPUStreamTy *Stream = nullptr;
+
+    // Use one-step asynchronous operation when host memory is already pinned.
+    if (void *PinnedPtr =
+            PinnedAllocs.getDeviceAccessiblePtrFromPinnedBuffer(HstRect.Base)) {
+      if (auto Err = getStream(AsyncInfoWrapper, Stream))
+        return Err;
+
+      HstRect.Base = PinnedPtr;
+      return Stream->pushPinnedMemoryCopyRectAsync(TgtRect, HstRect, Size,
+                                                   hsaHostToDevice);
+    }
+
+    return Plugin::error(ErrorCode::INVALID_VALUE,
+                         "AMDGPU doesn't support 2D/3D copies involving memory "
+                         "not allocated with `olMemAlloc`");
+  }
+
   /// Retrieve data from the device (device to host transfer).
   Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size,
                          AsyncInfoWrapperTy &AsyncInfoWrapper) override {
@@ -2583,6 +2680,27 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
                                           PinnedMemoryManager);
   }
 
+  /// 2D/3D device to host transfer.
+  Error dataRetrieveRectImpl(MemcpyRectTy HstRect, MemcpyRectTy TgtRect,
+                             uint32_t Size[3],
+                             AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+    AMDGPUStreamTy *Stream = nullptr;
+
+    if (void *PinnedPtr =
+            PinnedAllocs.getDeviceAccessiblePtrFromPinnedBuffer(HstRect.Base)) {
+      if (auto Err = getStream(AsyncInfoWrapper, Stream))
+        return Err;
+
+      HstRect.Base = PinnedPtr;
+      return Stream->pushPinnedMemoryCopyRectAsync(HstRect, TgtRect, Size,
+                                                   hsaDeviceToHost);
+    }
+
+    return Plugin::error(ErrorCode::INVALID_VALUE,
+                         "AMDGPU doesn't support 2D/3D copies involving memory "
+                         "not allocated with `olMemAlloc`");
+  }
+
   /// Exchange data between two devices within the plugin.
   Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstGenericDevice,
                          void *DstPtr, int64_t Size,
@@ -2620,6 +2738,20 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
                                           getAgent(), (uint64_t)Size);
   }
 
+  /// 2D/3D device to device transfer.
+  Error dataExchangeRectImpl(MemcpyRectTy SrcRect,
+                             GenericDeviceTy &DstGenericDevice,
+                             MemcpyRectTy DstRect, uint32_t Size[3],
+                             AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+    AMDGPUDeviceTy &DstDevice = static_cast<AMDGPUDeviceTy &>(DstGenericDevice);
+    AMDGPUStreamTy *Stream = nullptr;
+
+    if (auto Err = getStream(AsyncInfoWrapper, Stream))
+      return Err;
+    return Stream->pushMemoryCopyD2DRectAsync(DstRect, SrcRect,
+                                              DstDevice.getAgent(), Size);
+  }
+
   /// Insert a data fence between previous data operations and the following
   /// operations. This is a no-op for AMDGPU devices as operations inserted into
   /// a queue are in-order.
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 5620437716b31..9efd6639339e0 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -921,12 +921,25 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   virtual Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size,
                                AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
 
+  Error dataSubmitRect(MemcpyRectTy TgtRect, const MemcpyRectTy HstRect,
+                       uint32_t Size[3], __tgt_async_info *AsyncInfo);
+  virtual Error dataSubmitRectImpl(const MemcpyRectTy TgtRect,
+                                   MemcpyRectTy HstRect, uint32_t Size[3],
+                                   AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
+
   /// Retrieve data from the device (device to host transfer).
   Error dataRetrieve(void *HstPtr, const void *TgtPtr, int64_t Size,
                      __tgt_async_info *AsyncInfo);
   virtual Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size,
                                  AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
 
+  Error dataRetrieveRect(MemcpyRectTy HstRect, const MemcpyRectTy TgtRect,
+                         uint32_t Size[3], __tgt_async_info *AsyncInfo);
+  virtual Error dataRetrieveRectImpl(MemcpyRectTy HstRect,
+                                     const MemcpyRectTy TgtRect,
+                                     uint32_t Size[3],
+                                     AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
+
   /// Instert a data fence between previous data operations and the following
   /// operations if necessary for the device
   virtual Error dataFence(__tgt_async_info *AsyncInfo) = 0;
@@ -940,6 +953,15 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
                                  void *DstPtr, int64_t Size,
                                  AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
 
+  Error dataExchangeRect(MemcpyRectTy SrcRect, GenericDeviceTy &DstDev,
+                         const MemcpyRectTy DstRect, uint32_t Size[3],
+                         __tgt_async_info *AsyncInfo);
+  virtual Error dataExchangeRectImpl(MemcpyRectTy SrcRect,
+                                     GenericDeviceTy &DstDev,
+                                     const MemcpyRectTy DstRect,
+                                     uint32_t Size[3],
+                                     AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
+
   /// Fill data on the device with a pattern from the host
   Error dataFill(void *TgtPtr, const void *PatternPtr, int64_t PatternSize,
                  int64_t Size, __tgt_async_info *AsyncInfo);
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 30b5db782370d..78823674eb146 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -1423,6 +1423,17 @@ Error GenericDeviceTy::dataSubmit(void *TgtPtr, const void *HstPtr,
   return Err;
 }
 
+Error GenericDeviceTy::dataSubmitRect(MemcpyRectTy TgtRect,
+                                      const MemcpyRectTy HstRect,
+                                      uint32_t Size[3],
+                                      __tgt_async_info *AsyncInfo) {
+  AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);
+
+  auto Err = dataSubmitRectImpl(TgtRect, HstRect, Size, AsyncInfoWrapper);
+  AsyncInfoWrapper.finalize(Err);
+  return Err;
+}
+
 Error GenericDeviceTy::dataRetrieve(void *HstPtr, const void *TgtPtr,
                                     int64_t Size, __tgt_async_info *AsyncInfo) {
   AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);
@@ -1432,6 +1443,17 @@ Error GenericDeviceTy::dataRetrieve(void *HstPtr, const void *TgtPtr,
   return Err;
 }
 
+Error GenericDeviceTy::dataRetrieveRect(MemcpyRectTy HstRect,
+                                        const MemcpyRectTy TgtRect,
+                                        uint32_t Size[3],
+                                        __tgt_async_info *AsyncInfo) {
+  AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);
+
+  auto Err = dataRetrieveRectImpl(HstRect, TgtRect, Size, AsyncInfoWrapper);
+  AsyncInfoWrapper.finalize(Err);
+  return Err;
+}
+
 Error GenericDeviceTy::dataExchange(const void *SrcPtr, GenericDeviceTy &DstDev,
                                     void *DstPtr, int64_t Size,
                                     __tgt_async_info *AsyncInfo) {
@@ -1442,6 +1464,19 @@ Error GenericDeviceTy::dataExchange(const void *SrcPtr, GenericDeviceTy &DstDev,
   return Err;
 }
 
+Error GenericDeviceTy::dataExchangeRect(MemcpyRectTy SrcRect,
+                                        GenericDeviceTy &DstDev,
+                                        const MemcpyRectTy DstRect,
+                                        uint32_t Size[3],
+                                        __tgt_async_info *AsyncInfo) {
+  AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);
+
+  auto Err =
+      dataExchangeRectImpl(SrcRect, DstDev, DstRect, Size, AsyncInfoWrapper);
+  AsyncInfoWrapper.finalize(Err);
+  return Err;
+}
+
 Error GenericDeviceTy::dataFill(void *TgtPtr, const void *PatternPtr,
                                 int64_t PatternSize, int64_t Size,
                                 __tgt_async_info *AsyncInfo) {
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index b2f840113cff3..0448b9ba9873d 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -814,6 +814,13 @@ struct CUDADeviceTy : public GenericDeviceTy {
     return Plugin::check(Res, "error in cuMemcpyHtoDAsync: %s");
   }
 
+  Error dataSubmitRectImpl(MemcpyRectTy TgtRect, MemcpyRectTy HstRect,
+                           uint32_t Size[3],
+                           AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+    return Plugin::error(ErrorCode::UNIMPLEMENTED,
+                         "CUDA does not yet support 2D/3D copies");
+  }
+
   /// Retrieve data from the device (device to host transfer).
   Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size,
                          AsyncInfoWrapperTy &AsyncInfoWrapper) override {
@@ -828,12 +835,26 @@ struct CUDADeviceTy : public GenericDeviceTy {
     return Plugin::check(Res, "error in cuMemcpyDtoHAsync: %s");
   }
 
+  Error dataRetrieveRectImpl(MemcpyRectTy HstRect, MemcpyRectTy TgtRect,
+                             uint32_t Size[3],
+                             AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+    return Plugin::error(ErrorCode::UNIMPLEMENTED,
+                         "CUDA does not yet support 2D/3D copies");
+  }
+
   /// Exchange data between two devices directly. We may use peer access if
   /// the CUDA devices and driver allow them.
   Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstGenericDevice,
                          void *DstPtr, int64_t Size,
                          AsyncInfoWrapperTy &AsyncInfoWrapper) override;
 
+  Error dataExchangeRectImpl(MemcpyRectTy SrcRect, GenericDeviceTy &Dst,
+                             MemcpyRectTy DstRect, uint32_t Size[3],
+                             AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+    return Plugin::error(ErrorCode::UNIMPLEMENTED,
+                         "CUDA does not yet support 2D/3D copies");
+  }
+
   Error dataFillImpl(void *TgtPtr, const void *PatternPtr, int64_t PatternSize,
                      int64_t Size,
                      AsyncInfoWrapperTy &AsyncInfoWrapper) override {
diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp
index 44e2584fe53cc..66f4de83e9275 100644
--- a/offload/plugins-nextgen/host/src/rtl.cpp
+++ b/offload/plugins-nextgen/host/src/rtl.cpp
@@ -285,6 +285,13 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
     return Plugin::success();
   }
 
+  Error dataSubmitRectImpl(MemcpyRectTy TgtRect, MemcpyRectTy HstRect,
+                           uint32_t Size[3],
+                           AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+    return Plugin::error(ErrorCode::UNIMPLEMENTED,
+                         "Host does not yet support 2D/3D copies");
+  }
+
   /// Retrieve data from the device (device to host transfer).
   Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size,
                          AsyncInfoWrapperTy &AsyncInfoWrapper) override {
@@ -292,6 +299,13 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
     return Plugin::success();
   }
 
+  Error dataRetrieveRectImpl(MemcpyRectTy HstRect, MemcpyRectTy TgtRect,
+                             uint32_t Size[3],
+                             AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+    return Plugin::error(ErrorCode::UNIMPLEMENTED,
+                         "Host does not yet support 2D/3D copies");
+  }
+
   /// Exchange data between two devices within the plugin. This function is not
   /// supported in this plugin.
   Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstGenericDevice,
@@ -303,6 +317,13 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
                          "dataExchangeImpl not supported");
   }
 
+  Error dataExchangeRectImpl(MemcpyRectTy SrcRect, GenericDeviceTy &Dst,
+                             MemcpyRectTy DstRect, uint32_t Size[3],
+                             AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+    return Plugin::error(ErrorCode::UNIMPLEMENTED,
+                         "Host does not yet support 2D/3D copies");
+  }
+
   /// Insert a data fence between previous data operations and the following
   /// operations. This is a no-op for Host devices as operations inserted into
   /// a queue are in-order.
diff --git a/offload/unittests/OffloadAPI/CMakeLists.txt b/offload/unittests/OffloadAPI/CMakeLists.txt
index ba35c1ee87aac..ae3abba27801f 100644
--- a/offload/unittests/OffloadAPI/CMakeLists.txt
+++ b/offload/unittests/OffloadAPI/CMakeLists.txt
@@ -27,7 +27,8 @@ add_offload_unittest("memory"
     memory/olMemAlloc.cpp
     memory/olMemFill.cpp
     memory/olMemFree.cpp
-    memory/olMemcpy.cpp)
+    memory/olMemcpy.cpp
+    memory/olMemcpyRect.cpp)
 
 add_offload_unittest("platform"
     platform/olGetPlatformInfo.cpp
diff --git a/offload/unittests/OffloadAPI/memory/olMemcpyRect.cpp b/offload/unittests/OffloadAPI/memory/olMemcpyRect.cpp
new file mode 100644
index 0000000000000..0118383d52d0b
--- /dev/null
+++ b/offload/unittests/OffloadAPI/memory/olMemcpyRect.cpp
@@ -0,0 +1,358 @@
+//===------- Offload API tests - olMemcpyRect ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "../common/Fixtures.hpp"
+#include <OffloadAPI.h>
+#include <gtest/gtest.h>
+
+// Dimensions of the full allocation used by every test (x * y * z elements,
+// one byte each).
+constexpr ol_dimensions_t FULL_SIZE = {16, 8, 4};
+constexpr size_t BYTES = FULL_SIZE.x * FULL_SIZE.y * FULL_SIZE.z;
+
+// Extent and origin of the sub-region copied by the Success* tests; the box
+// fits strictly inside FULL_SIZE so untouched bytes can be verified too.
+constexpr ol_dimensions_t COPY_SIZE = {4, 3, 2};
+constexpr ol_dimensions_t COPY_OFFSET = {8, 2, 1};
+
+// Fixture: allocates one host-pinned buffer filled with 'h' and two device
+// buffers filled with 'd' and 'D', so checkPattern() can identify the origin
+// of every byte after a rectangular copy.
+struct olMemcpyRectTest : OffloadQueueTest {
+  void SetUp() override {
+    RETURN_ON_FATAL_FAILURE(OffloadQueueTest::SetUp());
+
+    ol_platform_handle_t Platform;
+    ASSERT_SUCCESS(olGetDeviceInfo(Device, OL_DEVICE_INFO_PLATFORM,
+                                   sizeof(Platform), &Platform));
+    ol_platform_backend_t Backend;
+    ASSERT_SUCCESS(olGetPlatformInfo(Platform, OL_PLATFORM_INFO_BACKEND,
+                                     sizeof(Backend), &Backend));
+    if (Backend == OL_PLATFORM_BACKEND_CUDA)
+      GTEST_SKIP() << "CUDA does not yet support this entry point\n";
+
+    Buff.fill('h');
+    ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_HOST, BYTES, &HostPtr));
+    ASSERT_SUCCESS(
+        olMemcpy(nullptr, HostPtr, Device, Buff.data(), Host, BYTES));
+
+    Buff.fill('d');
+    ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, BYTES, &DevicePtr));
+    ASSERT_SUCCESS(
+        olMemcpy(nullptr, DevicePtr, Device, Buff.data(), Host, BYTES));
+
+    Buff.fill('D');
+    ASSERT_SUCCESS(
+        olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, BYTES, &DevicePtr2));
+    ASSERT_SUCCESS(
+        olMemcpy(nullptr, DevicePtr2, Device, Buff.data(), Host, BYTES));
+
+    SrcRect.offset = DstRect.offset = COPY_OFFSET;
+    SrcRect.pitch = DstRect.pitch = FULL_SIZE.x;
+    SrcRect.slice = DstRect.slice = FULL_SIZE.y * FULL_SIZE.x;
+  }
+
+  void TearDown() override {
+    // SetUp may GTEST_SKIP() before any allocation succeeds; gtest still runs
+    // TearDown in that case, so only free pointers that were populated. Note
+    // that all three allocations (including DevicePtr2) must be released.
+    if (HostPtr)
+      ASSERT_SUCCESS(olMemFree(HostPtr));
+    if (DevicePtr)
+      ASSERT_SUCCESS(olMemFree(DevicePtr));
+    if (DevicePtr2)
+      ASSERT_SUCCESS(olMemFree(DevicePtr2));
+    OffloadQueueTest::TearDown();
+  }
+
+  // Read CheckBuffer back from the device into Buff and compare it
+  // byte-for-byte against the BYTES-long Template; on the first mismatch,
+  // dump expected/actual/delta views of every Z-slice for debugging.
+  void checkPattern(void *CheckBuffer, const char *Template) {
+    ASSERT_SUCCESS(
+        olMemcpy(nullptr, Buff.data(), Host, CheckBuffer, Device, BYTES));
+    bool Failed = false;
+
+    for (size_t I = 0; I < BYTES; I++) {
+      if (Buff[I] != Template[I]) {
+        ADD_FAILURE() << "Failure at location " << I << "\n";
+        Failed = true;
+        break;
+      }
+    }
+
+    if (Failed) {
+      std::cerr << "Expected:\n";
+      printSlices([&](size_t I) -> char { return Template[I]; });
+      std::cerr << "Got:\n";
+      printSlices([&](size_t I) -> char { return Buff[I]; });
+      std::cerr << "Delta:\n";
+      printSlices(
+          [&](size_t I) -> char { return Buff[I] == Template[I] ? '.' : 'X'; });
+    }
+  }
+
+  // Print each row of each Z-slice side by side; Getter maps a linear index
+  // into the character to display.
+  template <typename F> void printSlices(F Getter) {
+    for (size_t Y = 0; Y < FULL_SIZE.y; Y++) {
+      for (size_t Z = 0; Z < FULL_SIZE.z; Z++) {
+        for (size_t X = 0; X < FULL_SIZE.x; X++) {
+          std::cerr << Getter(X + (Y * FULL_SIZE.x) +
+                              (Z * FULL_SIZE.y * FULL_SIZE.x));
+        }
+        std::cerr << "    ";
+      }
+
+      std::cerr << "\n";
+    }
+  }
+
+  // Host staging buffer, also reused as scratch by checkPattern().
+  std::array<uint8_t, BYTES> Buff;
+  // Null-initialized so TearDown can tell which allocations succeeded.
+  void *HostPtr = nullptr;
+  void *DevicePtr = nullptr;
+  void *DevicePtr2 = nullptr;
+  ol_memcpy_rect_t SrcRect;
+  ol_memcpy_rect_t DstRect;
+};
+OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olMemcpyRectTest);
+
+// Host -> device: the 'h' bytes must land in a COPY_SIZE (4x3x2) box at
+// COPY_OFFSET (8,2,1) of the 'd'-filled device buffer; every byte outside
+// the box must remain 'd'.
+TEST_P(olMemcpyRectTest, SuccessHtoD) {
+  DstRect.buffer = DevicePtr;
+  SrcRect.buffer = HostPtr;
+
+  ASSERT_SUCCESS(
+      olMemcpyRect(Queue, DstRect, Device, SrcRect, Host, COPY_SIZE));
+  ASSERT_SUCCESS(olSyncQueue(Queue));
+
+  // clang-format off
+  checkPattern(DevicePtr,
+    "dddddddddddddddd"
+    "dddddddddddddddd"
+    "dddddddddddddddd"
+    "dddddddddddddddd"
+    "dddddddddddddddd"
+    "dddddddddddddddd"
+    "dddddddddddddddd"
+    "dddddddddddddddd"
+
+    "dddddddddddddddd"
+    "dddddddddddddddd"
+    "ddddddddhhhhdddd"
+    "ddddddddhhhhdddd"
+    "ddddddddhhhhdddd"
+    "dddddddddddddddd"
+    "dddddddddddddddd"
+    "dddddddddddddddd"
+
+    "dddddddddddddddd"
+    "dddddddddddddddd"
+    "ddddddddhhhhdddd"
+    "ddddddddhhhhdddd"
+    "ddddddddhhhhdddd"
+    "dddddddddddddddd"
+    "dddddddddddddddd"
+    "dddddddddddddddd"
+
+    "dddddddddddddddd"
+    "dddddddddddddddd"
+    "dddddddddddddddd"
+    "dddddddddddddddd"
+    "dddddddddddddddd"
+    "dddddddddddddddd"
+    "dddddddddddddddd"
+    "dddddddddddddddd"
+  );
+  // clang-format on
+}
+
+// Device -> host: mirror of SuccessHtoD — the 'd' bytes must land in the
+// COPY_SIZE box at COPY_OFFSET of the 'h'-filled host buffer, leaving the
+// rest of the destination untouched.
+TEST_P(olMemcpyRectTest, SuccessDtoH) {
+  DstRect.buffer = HostPtr;
+  SrcRect.buffer = DevicePtr;
+
+  ASSERT_SUCCESS(
+      olMemcpyRect(Queue, DstRect, Host, SrcRect, Device, COPY_SIZE));
+  ASSERT_SUCCESS(olSyncQueue(Queue));
+
+  // clang-format off
+  checkPattern(HostPtr,
+    "hhhhhhhhhhhhhhhh"
+    "hhhhhhhhhhhhhhhh"
+    "hhhhhhhhhhhhhhhh"
+    "hhhhhhhhhhhhhhhh"
+    "hhhhhhhhhhhhhhhh"
+    "hhhhhhhhhhhhhhhh"
+    "hhhhhhhhhhhhhhhh"
+    "hhhhhhhhhhhhhhhh"
+
+    "hhhhhhhhhhhhhhhh"
+    "hhhhhhhhhhhhhhhh"
+    "hhhhhhhhddddhhhh"
+    "hhhhhhhhddddhhhh"
+    "hhhhhhhhddddhhhh"
+    "hhhhhhhhhhhhhhhh"
+    "hhhhhhhhhhhhhhhh"
+    "hhhhhhhhhhhhhhhh"
+
+    "hhhhhhhhhhhhhhhh"
+    "hhhhhhhhhhhhhhhh"
+    "hhhhhhhhddddhhhh"
+    "hhhhhhhhddddhhhh"
+    "hhhhhhhhddddhhhh"
+    "hhhhhhhhhhhhhhhh"
+    "hhhhhhhhhhhhhhhh"
+    "hhhhhhhhhhhhhhhh"
+
+    "hhhhhhhhhhhhhhhh"
+    "hhhhhhhhhhhhhhhh"
+    "hhhhhhhhhhhhhhhh"
+    "hhhhhhhhhhhhhhhh"
+    "hhhhhhhhhhhhhhhh"
+    "hhhhhhhhhhhhhhhh"
+    "hhhhhhhhhhhhhhhh"
+    "hhhhhhhhhhhhhhhh"
+  );
+  // clang-format on
+}
+
+// Device -> device: 'D' bytes from the second device buffer must land in the
+// COPY_SIZE box at COPY_OFFSET of the first ('d'-filled) device buffer.
+TEST_P(olMemcpyRectTest, SuccessDtoD) {
+  DstRect.buffer = DevicePtr;
+  SrcRect.buffer = DevicePtr2;
+
+  ASSERT_SUCCESS(
+      olMemcpyRect(Queue, DstRect, Device, SrcRect, Device, COPY_SIZE));
+  ASSERT_SUCCESS(olSyncQueue(Queue));
+
+  // clang-format off
+  checkPattern(DevicePtr,
+    "dddddddddddddddd"
+    "dddddddddddddddd"
+    "dddddddddddddddd"
+    "dddddddddddddddd"
+    "dddddddddddddddd"
+    "dddddddddddddddd"
+    "dddddddddddddddd"
+    "dddddddddddddddd"
+
+    "dddddddddddddddd"
+    "dddddddddddddddd"
+    "ddddddddDDDDdddd"
+    "ddddddddDDDDdddd"
+    "ddddddddDDDDdddd"
+    "dddddddddddddddd"
+    "dddddddddddddddd"
+    "dddddddddddddddd"
+
+    "dddddddddddddddd"
+    "dddddddddddddddd"
+    "ddddddddDDDDdddd"
+    "ddddddddDDDDdddd"
+    "ddddddddDDDDdddd"
+    "dddddddddddddddd"
+    "dddddddddddddddd"
+    "dddddddddddddddd"
+
+    "dddddddddddddddd"
+    "dddddddddddddddd"
+    "dddddddddddddddd"
+    "dddddddddddddddd"
+    "dddddddddddddddd"
+    "dddddddddddddddd"
+    "dddddddddddddddd"
+    "dddddddddddddddd"
+  );
+  // clang-format on
+}
+
+// Argument validation: null buffer pointers, null device handles, and a
+// zero-sized region must be rejected with the matching error code before any
+// copy takes place.
+TEST_P(olMemcpyRectTest, InvalidDstPtr) {
+  DstRect.buffer = nullptr;
+  SrcRect.buffer = HostPtr;
+
+  ASSERT_ERROR(OL_ERRC_INVALID_NULL_POINTER,
+               olMemcpyRect(Queue, DstRect, Device, SrcRect, Host, COPY_SIZE));
+}
+
+TEST_P(olMemcpyRectTest, InvalidSrcPtr) {
+  DstRect.buffer = HostPtr;
+  SrcRect.buffer = nullptr;
+
+  ASSERT_ERROR(OL_ERRC_INVALID_NULL_POINTER,
+               olMemcpyRect(Queue, DstRect, Device, SrcRect, Host, COPY_SIZE));
+}
+
+TEST_P(olMemcpyRectTest, InvalidDstDevice) {
+  DstRect.buffer = HostPtr;
+  SrcRect.buffer = DevicePtr;
+
+  ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE,
+               olMemcpyRect(Queue, DstRect, nullptr, SrcRect, Host, COPY_SIZE));
+}
+
+TEST_P(olMemcpyRectTest, InvalidSrcDevice) {
+  DstRect.buffer = HostPtr;
+  SrcRect.buffer = DevicePtr;
+
+  ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE,
+               olMemcpyRect(Queue, DstRect, Host, SrcRect, nullptr, COPY_SIZE));
+}
+
+TEST_P(olMemcpyRectTest, InvalidSize) {
+  DstRect.buffer = HostPtr;
+  SrcRect.buffer = DevicePtr;
+
+  ASSERT_ERROR(OL_ERRC_INVALID_SIZE,
+               olMemcpyRect(Queue, DstRect, Host, SrcRect, Device, {0, 0, 0}));
+}
+
+// Alignment validation: per the olMemcpyRect contract, base addresses, pitch
+// and slice must all be 4-byte aligned; each misaligned field must produce
+// OL_ERRC_INVALID_SIZE.
+TEST_P(olMemcpyRectTest, InvalidSrcPtrAlign) {
+  DstRect.buffer = HostPtr;
+  SrcRect.buffer = &static_cast<char *>(DevicePtr)[2];
+
+  ASSERT_ERROR(OL_ERRC_INVALID_SIZE,
+               olMemcpyRect(Queue, DstRect, Host, SrcRect, Device, COPY_SIZE));
+}
+
+TEST_P(olMemcpyRectTest, InvalidDstPtrAlign) {
+  DstRect.buffer = &static_cast<char *>(HostPtr)[2];
+  SrcRect.buffer = DevicePtr;
+
+  ASSERT_ERROR(OL_ERRC_INVALID_SIZE,
+               olMemcpyRect(Queue, DstRect, Host, SrcRect, Device, COPY_SIZE));
+}
+
+TEST_P(olMemcpyRectTest, InvalidDstPitchAlign) {
+  DstRect.buffer = HostPtr;
+  DstRect.pitch = 2;
+  SrcRect.buffer = DevicePtr;
+
+  ASSERT_ERROR(OL_ERRC_INVALID_SIZE,
+               olMemcpyRect(Queue, DstRect, Host, SrcRect, Device, COPY_SIZE));
+}
+
+TEST_P(olMemcpyRectTest, InvalidSrcPitchAlign) {
+  DstRect.buffer = HostPtr;
+  SrcRect.buffer = DevicePtr;
+  SrcRect.pitch = 2;
+
+  ASSERT_ERROR(OL_ERRC_INVALID_SIZE,
+               olMemcpyRect(Queue, DstRect, Host, SrcRect, Device, COPY_SIZE));
+}
+
+TEST_P(olMemcpyRectTest, InvalidDstSliceAlign) {
+  DstRect.buffer = HostPtr;
+  DstRect.slice = 2;
+  SrcRect.buffer = DevicePtr;
+
+  ASSERT_ERROR(OL_ERRC_INVALID_SIZE,
+               olMemcpyRect(Queue, DstRect, Host, SrcRect, Device, COPY_SIZE));
+}
+
+TEST_P(olMemcpyRectTest, InvalidSrcSliceAlign) {
+  DstRect.buffer = HostPtr;
+  SrcRect.buffer = DevicePtr;
+  SrcRect.slice = 2;
+
+  ASSERT_ERROR(OL_ERRC_INVALID_SIZE,
+               olMemcpyRect(Queue, DstRect, Host, SrcRect, Device, COPY_SIZE));
+}
+
+// Allocation validation: olMemcpyRect requires all buffers to come from
+// olMemAlloc (pinned for the AMD runtime); plain host memory (Buff.data())
+// must be rejected on either side.
+TEST_P(olMemcpyRectTest, InvalidDstUnalloc) {
+  DstRect.buffer = Buff.data();
+  SrcRect.buffer = DevicePtr;
+
+  ASSERT_ERROR(OL_ERRC_INVALID_VALUE,
+               olMemcpyRect(Queue, DstRect, Host, SrcRect, Device, COPY_SIZE));
+}
+
+TEST_P(olMemcpyRectTest, InvalidSrcUnalloc) {
+  DstRect.buffer = DevicePtr;
+  SrcRect.buffer = Buff.data();
+
+  ASSERT_ERROR(OL_ERRC_INVALID_VALUE,
+               olMemcpyRect(Queue, DstRect, Device, SrcRect, Host, COPY_SIZE));
+}



More information about the llvm-commits mailing list