[llvm] [Offload] Implement the remaining initial Offload API (PR #122106)

Callum Fare via llvm-commits llvm-commits at lists.llvm.org
Fri Feb 21 03:46:39 PST 2025


https://github.com/callumfare updated https://github.com/llvm/llvm-project/pull/122106

>From 7cbe788ddc0de682ce0f939caf4619e99889f992 Mon Sep 17 00:00:00 2001
From: Callum Fare <callum at codeplay.com>
Date: Wed, 11 Dec 2024 12:08:44 +0000
Subject: [PATCH 01/17] WIP: Implement olMemAlloc, olMemFree

---
 offload/liboffload/API/Memory.td              | 45 +++++++++
 offload/liboffload/API/OffloadAPI.td          |  1 +
 .../liboffload/include/generated/OffloadAPI.h | 95 +++++++++++++++++++
 .../include/generated/OffloadEntryPoints.inc  | 93 ++++++++++++++++++
 .../include/generated/OffloadFuncs.inc        |  4 +
 .../generated/OffloadImplFuncDecls.inc        |  7 ++
 .../include/generated/OffloadPrint.hpp        | 53 +++++++++++
 offload/liboffload/src/OffloadImpl.cpp        | 35 +++++++
 8 files changed, 333 insertions(+)
 create mode 100644 offload/liboffload/API/Memory.td

diff --git a/offload/liboffload/API/Memory.td b/offload/liboffload/API/Memory.td
new file mode 100644
index 0000000000000..8cfaf70311e34
--- /dev/null
+++ b/offload/liboffload/API/Memory.td
@@ -0,0 +1,45 @@
+//===-- Memory.td - Memory definitions for Offload ---------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains Offload API definitions related to memory allocations
+//
+//===----------------------------------------------------------------------===//
+
+def : Enum {
+  let name = "ol_alloc_type_t";
+  let desc = "Represents the type of allocation made with olMemAlloc";
+  let etors = [
+    Etor<"HOST", "Host allocation">,
+    Etor<"DEVICE", "Device allocation">,
+    Etor<"SHARED", "Shared allocation">
+  ];
+}
+
+def : Function {
+  let name = "olMemAlloc";
+  let desc = "Creates a memory allocation on the specified device";
+  let params = [
+    Param<"ol_device_handle_t", "Device", "handle of the device to allocate on", PARAM_IN>,
+    Param<"ol_alloc_type_t", "Type", "type of the allocation", PARAM_IN>,
+    Param<"size_t", "Size", "size of the allocation in bytes", PARAM_IN>,
+    Param<"size_t", "Aligment", "alignment of the allocation in bytes", PARAM_IN>,
+    Param<"void**", "AllocationOut", "output for the allocated pointer", PARAM_OUT>
+  ];
+  let returns = [];
+}
+
+def : Function {
+  let name = "olMemFree";
+  let desc = "Frees a memory allocation previously made by olMemAlloc";
+  let params = [
+    Param<"ol_device_handle_t", "Device", "handle of the device to allocate on", PARAM_IN>,
+    Param<"ol_alloc_type_t", "Type", "type of the allocation", PARAM_IN>,
+    Param<"void*", "Address", "address of the allocation to free", PARAM_IN>,
+  ];
+  let returns = [];
+}
diff --git a/offload/liboffload/API/OffloadAPI.td b/offload/liboffload/API/OffloadAPI.td
index 8a0c3c4058122..a609cc7ac80b4 100644
--- a/offload/liboffload/API/OffloadAPI.td
+++ b/offload/liboffload/API/OffloadAPI.td
@@ -13,3 +13,4 @@ include "APIDefs.td"
 include "Common.td"
 include "Platform.td"
 include "Device.td"
+include "Memory.td"
diff --git a/offload/liboffload/include/generated/OffloadAPI.h b/offload/liboffload/include/generated/OffloadAPI.h
index 11fcc96625ab8..81f3a8e0201ba 100644
--- a/offload/liboffload/include/generated/OffloadAPI.h
+++ b/offload/liboffload/include/generated/OffloadAPI.h
@@ -460,6 +460,67 @@ OL_APIEXPORT ol_result_t OL_APICALL olGetDeviceInfoSize(
     // [out] pointer to the number of bytes required to store the query
     size_t *PropSizeRet);
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Represents the type of allocation made with olMemAlloc
+typedef enum ol_alloc_type_t {
+  /// Host allocation
+  OL_ALLOC_TYPE_HOST = 0,
+  /// Device allocation
+  OL_ALLOC_TYPE_DEVICE = 1,
+  /// Shared allocation
+  OL_ALLOC_TYPE_SHARED = 2,
+  /// @cond
+  OL_ALLOC_TYPE_FORCE_UINT32 = 0x7fffffff
+  /// @endcond
+
+} ol_alloc_type_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Creates a memory allocation on the specified device
+///
+/// @details
+///
+/// @returns
+///     - ::OL_RESULT_SUCCESS
+///     - ::OL_ERRC_UNINITIALIZED
+///     - ::OL_ERRC_DEVICE_LOST
+///     - ::OL_ERRC_INVALID_NULL_HANDLE
+///         + `NULL == Device`
+///     - ::OL_ERRC_INVALID_NULL_POINTER
+///         + `NULL == AllocationOut`
+OL_APIEXPORT ol_result_t OL_APICALL olMemAlloc(
+    // [in] handle of the device to allocate on
+    ol_device_handle_t Device,
+    // [in] type of the allocation
+    ol_alloc_type_t Type,
+    // [in] size of the allocation in bytes
+    size_t Size,
+    // [in] alignment of the allocation in bytes
+    size_t Aligment,
+    // [out] output for the allocated pointer
+    void **AllocationOut);
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Frees a memory allocation previously made by olMemAlloc
+///
+/// @details
+///
+/// @returns
+///     - ::OL_RESULT_SUCCESS
+///     - ::OL_ERRC_UNINITIALIZED
+///     - ::OL_ERRC_DEVICE_LOST
+///     - ::OL_ERRC_INVALID_NULL_HANDLE
+///         + `NULL == Device`
+///     - ::OL_ERRC_INVALID_NULL_POINTER
+///         + `NULL == Address`
+OL_APIEXPORT ol_result_t OL_APICALL olMemFree(
+    // [in] handle of the device to allocate on
+    ol_device_handle_t Device,
+    // [in] type of the allocation
+    ol_alloc_type_t Type,
+    // [in] address of the allocation to free
+    void *Address);
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Function parameters for olGetPlatform
 /// @details Each entry is a pointer to the parameter passed to the function;
@@ -530,6 +591,26 @@ typedef struct ol_get_device_info_size_params_t {
   size_t **pPropSizeRet;
 } ol_get_device_info_size_params_t;
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Function parameters for olMemAlloc
+/// @details Each entry is a pointer to the parameter passed to the function;
+typedef struct ol_mem_alloc_params_t {
+  ol_device_handle_t *pDevice;
+  ol_alloc_type_t *pType;
+  size_t *pSize;
+  size_t *pAligment;
+  void ***pAllocationOut;
+} ol_mem_alloc_params_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Function parameters for olMemFree
+/// @details Each entry is a pointer to the parameter passed to the function;
+typedef struct ol_mem_free_params_t {
+  ol_device_handle_t *pDevice;
+  ol_alloc_type_t *pType;
+  void **pAddress;
+} ol_mem_free_params_t;
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Variant of olInit that also sets source code location information
 /// @details See also ::olInit
@@ -605,6 +686,20 @@ OL_APIEXPORT ol_result_t OL_APICALL olGetDeviceInfoSizeWithCodeLoc(
     ol_device_handle_t Device, ol_device_info_t PropName, size_t *PropSizeRet,
     ol_code_location_t *CodeLocation);
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Variant of olMemAlloc that also sets source code location information
+/// @details See also ::olMemAlloc
+OL_APIEXPORT ol_result_t OL_APICALL olMemAllocWithCodeLoc(
+    ol_device_handle_t Device, ol_alloc_type_t Type, size_t Size,
+    size_t Aligment, void **AllocationOut, ol_code_location_t *CodeLocation);
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Variant of olMemFree that also sets source code location information
+/// @details See also ::olMemFree
+OL_APIEXPORT ol_result_t OL_APICALL
+olMemFreeWithCodeLoc(ol_device_handle_t Device, ol_alloc_type_t Type,
+                     void *Address, ol_code_location_t *CodeLocation);
+
 #if defined(__cplusplus)
 } // extern "C"
 #endif
diff --git a/offload/liboffload/include/generated/OffloadEntryPoints.inc b/offload/liboffload/include/generated/OffloadEntryPoints.inc
index 49c1c8169615e..08060dae80f03 100644
--- a/offload/liboffload/include/generated/OffloadEntryPoints.inc
+++ b/offload/liboffload/include/generated/OffloadEntryPoints.inc
@@ -439,3 +439,96 @@ ol_result_t olGetDeviceInfoSizeWithCodeLoc(ol_device_handle_t Device,
   currentCodeLocation() = nullptr;
   return Result;
 }
+
+///////////////////////////////////////////////////////////////////////////////
+ol_impl_result_t olMemAlloc_val(ol_device_handle_t Device, ol_alloc_type_t Type,
+                                size_t Size, size_t Aligment,
+                                void **AllocationOut) {
+  if (true /*enableParameterValidation*/) {
+    if (NULL == Device) {
+      return OL_ERRC_INVALID_NULL_HANDLE;
+    }
+
+    if (NULL == AllocationOut) {
+      return OL_ERRC_INVALID_NULL_POINTER;
+    }
+  }
+
+  return olMemAlloc_impl(Device, Type, Size, Aligment, AllocationOut);
+}
+OL_APIEXPORT ol_result_t OL_APICALL olMemAlloc(ol_device_handle_t Device,
+                                               ol_alloc_type_t Type,
+                                               size_t Size, size_t Aligment,
+                                               void **AllocationOut) {
+  if (offloadConfig().TracingEnabled) {
+    std::cout << "---> olMemAlloc";
+  }
+
+  ol_result_t Result =
+      olMemAlloc_val(Device, Type, Size, Aligment, AllocationOut);
+
+  if (offloadConfig().TracingEnabled) {
+    ol_mem_alloc_params_t Params = {&Device, &Type, &Size, &Aligment,
+                                    &AllocationOut};
+    std::cout << "(" << &Params << ")";
+    std::cout << "-> " << Result << "\n";
+    if (Result && Result->Details) {
+      std::cout << "     *Error Details* " << Result->Details << " \n";
+    }
+  }
+  return Result;
+}
+ol_result_t olMemAllocWithCodeLoc(ol_device_handle_t Device,
+                                  ol_alloc_type_t Type, size_t Size,
+                                  size_t Aligment, void **AllocationOut,
+                                  ol_code_location_t *CodeLocation) {
+  currentCodeLocation() = CodeLocation;
+  ol_result_t Result = olMemAlloc(Device, Type, Size, Aligment, AllocationOut);
+
+  currentCodeLocation() = nullptr;
+  return Result;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+ol_impl_result_t olMemFree_val(ol_device_handle_t Device, ol_alloc_type_t Type,
+                               void *Address) {
+  if (true /*enableParameterValidation*/) {
+    if (NULL == Device) {
+      return OL_ERRC_INVALID_NULL_HANDLE;
+    }
+
+    if (NULL == Address) {
+      return OL_ERRC_INVALID_NULL_POINTER;
+    }
+  }
+
+  return olMemFree_impl(Device, Type, Address);
+}
+OL_APIEXPORT ol_result_t OL_APICALL olMemFree(ol_device_handle_t Device,
+                                              ol_alloc_type_t Type,
+                                              void *Address) {
+  if (offloadConfig().TracingEnabled) {
+    std::cout << "---> olMemFree";
+  }
+
+  ol_result_t Result = olMemFree_val(Device, Type, Address);
+
+  if (offloadConfig().TracingEnabled) {
+    ol_mem_free_params_t Params = {&Device, &Type, &Address};
+    std::cout << "(" << &Params << ")";
+    std::cout << "-> " << Result << "\n";
+    if (Result && Result->Details) {
+      std::cout << "     *Error Details* " << Result->Details << " \n";
+    }
+  }
+  return Result;
+}
+ol_result_t olMemFreeWithCodeLoc(ol_device_handle_t Device,
+                                 ol_alloc_type_t Type, void *Address,
+                                 ol_code_location_t *CodeLocation) {
+  currentCodeLocation() = CodeLocation;
+  ol_result_t Result = olMemFree(Device, Type, Address);
+
+  currentCodeLocation() = nullptr;
+  return Result;
+}
diff --git a/offload/liboffload/include/generated/OffloadFuncs.inc b/offload/liboffload/include/generated/OffloadFuncs.inc
index 48115493c790f..26120f18279dc 100644
--- a/offload/liboffload/include/generated/OffloadFuncs.inc
+++ b/offload/liboffload/include/generated/OffloadFuncs.inc
@@ -20,6 +20,8 @@ OFFLOAD_FUNC(olGetDeviceCount)
 OFFLOAD_FUNC(olGetDevice)
 OFFLOAD_FUNC(olGetDeviceInfo)
 OFFLOAD_FUNC(olGetDeviceInfoSize)
+OFFLOAD_FUNC(olMemAlloc)
+OFFLOAD_FUNC(olMemFree)
 OFFLOAD_FUNC(olInitWithCodeLoc)
 OFFLOAD_FUNC(olShutDownWithCodeLoc)
 OFFLOAD_FUNC(olGetPlatformWithCodeLoc)
@@ -30,5 +32,7 @@ OFFLOAD_FUNC(olGetDeviceCountWithCodeLoc)
 OFFLOAD_FUNC(olGetDeviceWithCodeLoc)
 OFFLOAD_FUNC(olGetDeviceInfoWithCodeLoc)
 OFFLOAD_FUNC(olGetDeviceInfoSizeWithCodeLoc)
+OFFLOAD_FUNC(olMemAllocWithCodeLoc)
+OFFLOAD_FUNC(olMemFreeWithCodeLoc)
 
 #undef OFFLOAD_FUNC
diff --git a/offload/liboffload/include/generated/OffloadImplFuncDecls.inc b/offload/liboffload/include/generated/OffloadImplFuncDecls.inc
index 5b26b2653a05d..f0a96081fd243 100644
--- a/offload/liboffload/include/generated/OffloadImplFuncDecls.inc
+++ b/offload/liboffload/include/generated/OffloadImplFuncDecls.inc
@@ -36,3 +36,10 @@ ol_impl_result_t olGetDeviceInfo_impl(ol_device_handle_t Device,
 ol_impl_result_t olGetDeviceInfoSize_impl(ol_device_handle_t Device,
                                           ol_device_info_t PropName,
                                           size_t *PropSizeRet);
+
+ol_impl_result_t olMemAlloc_impl(ol_device_handle_t Device,
+                                 ol_alloc_type_t Type, size_t Size,
+                                 size_t Aligment, void **AllocationOut);
+
+ol_impl_result_t olMemFree_impl(ol_device_handle_t Device, ol_alloc_type_t Type,
+                                void *Address);
diff --git a/offload/liboffload/include/generated/OffloadPrint.hpp b/offload/liboffload/include/generated/OffloadPrint.hpp
index 8981bb054a4cb..cff754237568e 100644
--- a/offload/liboffload/include/generated/OffloadPrint.hpp
+++ b/offload/liboffload/include/generated/OffloadPrint.hpp
@@ -31,6 +31,7 @@ inline std::ostream &operator<<(std::ostream &os,
                                 enum ol_platform_backend_t value);
 inline std::ostream &operator<<(std::ostream &os, enum ol_device_type_t value);
 inline std::ostream &operator<<(std::ostream &os, enum ol_device_info_t value);
+inline std::ostream &operator<<(std::ostream &os, enum ol_alloc_type_t value);
 
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Print operator for the ol_errc_t type
@@ -274,6 +275,26 @@ inline void printTagged(std::ostream &os, const void *ptr,
     break;
   }
 }
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Print operator for the ol_alloc_type_t type
+/// @returns std::ostream &
+inline std::ostream &operator<<(std::ostream &os, enum ol_alloc_type_t value) {
+  switch (value) {
+  case OL_ALLOC_TYPE_HOST:
+    os << "OL_ALLOC_TYPE_HOST";
+    break;
+  case OL_ALLOC_TYPE_DEVICE:
+    os << "OL_ALLOC_TYPE_DEVICE";
+    break;
+  case OL_ALLOC_TYPE_SHARED:
+    os << "OL_ALLOC_TYPE_SHARED";
+    break;
+  default:
+    os << "unknown enumerator";
+    break;
+  }
+  return os;
+}
 
 inline std::ostream &operator<<(std::ostream &os,
                                 const ol_error_struct_t *Err) {
@@ -402,6 +423,38 @@ operator<<(std::ostream &os,
   return os;
 }
 
+inline std::ostream &operator<<(std::ostream &os,
+                                const struct ol_mem_alloc_params_t *params) {
+  os << ".Device = ";
+  printPtr(os, *params->pDevice);
+  os << ", ";
+  os << ".Type = ";
+  os << *params->pType;
+  os << ", ";
+  os << ".Size = ";
+  os << *params->pSize;
+  os << ", ";
+  os << ".Aligment = ";
+  os << *params->pAligment;
+  os << ", ";
+  os << ".AllocationOut = ";
+  printPtr(os, *params->pAllocationOut);
+  return os;
+}
+
+inline std::ostream &operator<<(std::ostream &os,
+                                const struct ol_mem_free_params_t *params) {
+  os << ".Device = ";
+  printPtr(os, *params->pDevice);
+  os << ", ";
+  os << ".Type = ";
+  os << *params->pType;
+  os << ", ";
+  os << ".Address = ";
+  printPtr(os, *params->pAddress);
+  return os;
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 // @brief Print pointer value
 template <typename T>
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index 457f1053f1634..3e609ed03917f 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -245,3 +245,38 @@ ol_impl_result_t olGetDeviceInfoSize_impl(ol_device_handle_t Device,
                                           size_t *PropSizeRet) {
   return olGetDeviceInfoImplDetail(Device, PropName, 0, nullptr, PropSizeRet);
 }
+
+TargetAllocTy convertOlToPluginAllocTy(ol_alloc_type_t Type) {
+  switch (Type) {
+  case OL_ALLOC_TYPE_DEVICE:
+    return TARGET_ALLOC_DEVICE;
+  case OL_ALLOC_TYPE_HOST:
+    return TARGET_ALLOC_HOST;
+  case OL_ALLOC_TYPE_SHARED:
+  default:
+    return TARGET_ALLOC_SHARED;
+  }
+}
+
+ol_impl_result_t olMemAlloc_impl(ol_device_handle_t Device,
+                                 ol_alloc_type_t Type, size_t Size, size_t,
+                                 void **AllocationOut) {
+  auto Alloc =
+      Device->Device.dataAlloc(Size, nullptr, convertOlToPluginAllocTy(Type));
+  if (!Alloc) {
+    return {OL_ERRC_OUT_OF_RESOURCES,
+            formatv("Could not create allocation on device {0}", Device).str()};
+  }
+
+  *AllocationOut = *Alloc;
+  return OL_SUCCESS;
+}
+
+ol_impl_result_t olMemFree_impl(ol_device_handle_t Device, ol_alloc_type_t Type,
+                                void *Address) {
+  auto Res = Device->Device.dataDelete(Address, convertOlToPluginAllocTy(Type));
+  if (Res) {
+    return {OL_ERRC_OUT_OF_RESOURCES, "Could not free allocation"};
+  }
+  return OL_SUCCESS;
+}

>From 73ed36a366dec72b63dccdc24d240e0efc0bf528 Mon Sep 17 00:00:00 2001
From: Callum Fare <callum at codeplay.com>
Date: Wed, 11 Dec 2024 12:13:29 +0000
Subject: [PATCH 02/17] Add size check

---
 offload/liboffload/API/Memory.td                            | 6 +++++-
 offload/liboffload/include/generated/OffloadAPI.h           | 2 ++
 offload/liboffload/include/generated/OffloadEntryPoints.inc | 4 ++++
 3 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/offload/liboffload/API/Memory.td b/offload/liboffload/API/Memory.td
index 8cfaf70311e34..c15ae6f6d21ca 100644
--- a/offload/liboffload/API/Memory.td
+++ b/offload/liboffload/API/Memory.td
@@ -30,7 +30,11 @@ def : Function {
     Param<"size_t", "Aligment", "alignment of the allocation in bytes", PARAM_IN>,
     Param<"void**", "AllocationOut", "output for the allocated pointer", PARAM_OUT>
   ];
-  let returns = [];
+  let returns = [
+    Return<"OL_ERRC_INVALID_SIZE", [
+      "`Size == 0`"
+    ]>
+  ];
 }
 
 def : Function {
diff --git a/offload/liboffload/include/generated/OffloadAPI.h b/offload/liboffload/include/generated/OffloadAPI.h
index 81f3a8e0201ba..4c3356645e55a 100644
--- a/offload/liboffload/include/generated/OffloadAPI.h
+++ b/offload/liboffload/include/generated/OffloadAPI.h
@@ -484,6 +484,8 @@ typedef enum ol_alloc_type_t {
 ///     - ::OL_RESULT_SUCCESS
 ///     - ::OL_ERRC_UNINITIALIZED
 ///     - ::OL_ERRC_DEVICE_LOST
+///     - ::OL_ERRC_INVALID_SIZE
+///         + `Size == 0`
 ///     - ::OL_ERRC_INVALID_NULL_HANDLE
 ///         + `NULL == Device`
 ///     - ::OL_ERRC_INVALID_NULL_POINTER
diff --git a/offload/liboffload/include/generated/OffloadEntryPoints.inc b/offload/liboffload/include/generated/OffloadEntryPoints.inc
index 08060dae80f03..bcde65452b265 100644
--- a/offload/liboffload/include/generated/OffloadEntryPoints.inc
+++ b/offload/liboffload/include/generated/OffloadEntryPoints.inc
@@ -445,6 +445,10 @@ ol_impl_result_t olMemAlloc_val(ol_device_handle_t Device, ol_alloc_type_t Type,
                                 size_t Size, size_t Aligment,
                                 void **AllocationOut) {
   if (true /*enableParameterValidation*/) {
+    if (Size == 0) {
+      return OL_ERRC_INVALID_SIZE;
+    }
+
     if (NULL == Device) {
       return OL_ERRC_INVALID_NULL_HANDLE;
     }

>From be5c36bd2b23fc9eb7886586d8687bde4de145e0 Mon Sep 17 00:00:00 2001
From: Callum Fare <callum at codeplay.com>
Date: Mon, 6 Jan 2025 15:22:52 +0000
Subject: [PATCH 03/17] Implement minimum Offload API needed to launch a SYCL
 kernel

---
 offload/liboffload/API/Common.td              |  20 +
 offload/liboffload/API/Enqueue.td             |  68 ++
 offload/liboffload/API/Event.td               |  41 +
 offload/liboffload/API/Kernel.td              |  44 +
 offload/liboffload/API/OffloadAPI.td          |   5 +
 offload/liboffload/API/Program.td             |  44 +
 offload/liboffload/API/Queue.td               |  52 ++
 .../liboffload/include/generated/OffloadAPI.h | 656 +++++++++++++++
 .../include/generated/OffloadEntryPoints.inc  | 775 ++++++++++++++++++
 .../include/generated/OffloadFuncs.inc        |  36 +
 .../generated/OffloadImplFuncDecls.inc        |  54 ++
 .../include/generated/OffloadPrint.hpp        | 210 +++++
 offload/liboffload/src/OffloadImpl.cpp        | 360 ++++++++
 .../common/include/GlobalHandler.h            |   5 +-
 offload/plugins-nextgen/cuda/src/rtl.cpp      |  28 +
 offload/plugins-nextgen/host/src/rtl.cpp      |   4 +-
 offload/unittests/OffloadAPI/CMakeLists.txt   |   4 +-
 .../OffloadAPI/queue/olCreateQueue.cpp        |  19 +
 18 files changed, 2420 insertions(+), 5 deletions(-)
 create mode 100644 offload/liboffload/API/Enqueue.td
 create mode 100644 offload/liboffload/API/Event.td
 create mode 100644 offload/liboffload/API/Kernel.td
 create mode 100644 offload/liboffload/API/Program.td
 create mode 100644 offload/liboffload/API/Queue.td
 create mode 100644 offload/unittests/OffloadAPI/queue/olCreateQueue.cpp

diff --git a/offload/liboffload/API/Common.td b/offload/liboffload/API/Common.td
index 5b19d1d47129e..7fedb2002f157 100644
--- a/offload/liboffload/API/Common.td
+++ b/offload/liboffload/API/Common.td
@@ -62,6 +62,26 @@ def : Handle {
   let desc = "Handle of context object";
 }
 
+def : Handle {
+  let name = "ol_queue_handle_t";
+  let desc = "Handle of queue object";
+}
+
+def : Handle {
+  let name = "ol_event_handle_t";
+  let desc = "Handle of event object";
+}
+
+def : Handle {
+  let name = "ol_program_handle_t";
+  let desc = "Handle of program object";
+}
+
+def : Handle {
+  let name = "ol_kernel_handle_t";
+  let desc = "Handle of kernel object";
+}
+
 def : Enum {
   let name = "ol_errc_t";
   let desc = "Defines Return/Error codes";
diff --git a/offload/liboffload/API/Enqueue.td b/offload/liboffload/API/Enqueue.td
new file mode 100644
index 0000000000000..621eb3a2f410e
--- /dev/null
+++ b/offload/liboffload/API/Enqueue.td
@@ -0,0 +1,68 @@
+//===-- Enqueue.td - Enqueue definitions for Offload -------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains Offload API definitions related to enqueueable operations
+//
+//===----------------------------------------------------------------------===//
+
+def : Function {
+    let name = "olEnqueueDataWrite";
+    let desc = "Enqueue a write operation from host to device memory";
+    let details = [];
+    let params = [
+        Param<"ol_queue_handle_t", "Queue", "handle of the queue", PARAM_IN>,
+        Param<"void*", "SrcPtr", "host pointer to copy from", PARAM_IN>,
+        Param<"void*", "DstPtr", "device pointer to copy to", PARAM_IN>,
+        Param<"size_t", "Size", "size in bytes of data to copy", PARAM_IN>,
+        Param<"ol_event_handle_t*", "EventOut", "optional recorded event for the enqueued operation", PARAM_OUT_OPTIONAL>
+    ];
+    let returns = [];
+}
+
+def : Function {
+    let name = "olEnqueueDataRead";
+    let desc = "Enqueue a read operation from device to host memory";
+    let details = [];
+    let params = [
+        Param<"ol_queue_handle_t", "Queue", "handle of the queue", PARAM_IN>,
+        Param<"void*", "SrcPtr", "device pointer to copy from", PARAM_IN>,
+        Param<"void*", "DstPtr", "host pointer to copy to", PARAM_IN>,
+        Param<"size_t", "Size", "size in bytes of data to copy", PARAM_IN>,
+        Param<"ol_event_handle_t*", "EventOut", "optional recorded event for the enqueued operation", PARAM_OUT_OPTIONAL>
+    ];
+    let returns = [];
+}
+
+def : Function {
+    let name = "olEnqueueDataCopy";
+    let desc = "Enqueue a copy operation between device allocations";
+    let details = [];
+    let params = [
+        Param<"ol_queue_handle_t", "Queue", "handle of the queue", PARAM_IN>,
+        Param<"void*", "SrcPtr", "device pointer to copy from", PARAM_IN>,
+        Param<"void*", "DstPtr", "device pointer to copy to", PARAM_IN>,
+        Param<"ol_device_handle_t", "DstDevice", "device that the destination pointer is resident on", PARAM_IN>,
+        Param<"size_t", "Size", "size in bytes of data to copy", PARAM_IN>,
+        Param<"ol_event_handle_t*", "EventOut", "optional recorded event for the enqueued operation", PARAM_OUT_OPTIONAL>
+    ];
+    let returns = [];
+}
+
+
+def : Function {
+    let name = "olEnqueueKernelLaunch";
+    let desc = "Enqueue a kernel launch with the specified size and parameters";
+    let details = [];
+    let params = [
+        Param<"ol_queue_handle_t", "Queue", "handle of the queue", PARAM_IN>,
+        Param<"ol_kernel_handle_t", "Kernel", "handle of the kernel", PARAM_IN>,
+        Param<"const size_t*", "GlobalWorkSize", "an array of size 3 representing the global work size", PARAM_IN>,
+        Param<"ol_event_handle_t*", "EventOut", "optional recorded event for the enqueued operation", PARAM_OUT_OPTIONAL>
+    ];
+    let returns = [];
+}
diff --git a/offload/liboffload/API/Event.td b/offload/liboffload/API/Event.td
new file mode 100644
index 0000000000000..db90a7c8e2be4
--- /dev/null
+++ b/offload/liboffload/API/Event.td
@@ -0,0 +1,41 @@
+//===-- Event.td - Event definitions for Offload -----------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains Offload API definitions related to the event handle
+//
+//===----------------------------------------------------------------------===//
+
+def : Function {
+    let name = "olRetainEvent";
+    let desc = "Increment the reference count of the given event";
+    let details = [];
+    let params = [
+        Param<"ol_event_handle_t", "Event", "handle of the event", PARAM_IN>
+    ];
+    let returns = [];
+}
+
+def : Function {
+    let name = "olReleaseEvent";
+    let desc = "Decrement the reference count of the given event";
+    let details = [];
+    let params = [
+        Param<"ol_event_handle_t", "Event", "handle of the event", PARAM_IN>
+    ];
+    let returns = [];
+}
+
+def : Function {
+    let name = "olWaitEvent";
+    let desc = "Wait for the event to be complete";
+    let details = [];
+    let params = [
+        Param<"ol_event_handle_t", "Event", "handle of the event", PARAM_IN>
+    ];
+    let returns = [];
+}
diff --git a/offload/liboffload/API/Kernel.td b/offload/liboffload/API/Kernel.td
new file mode 100644
index 0000000000000..936372c18ca37
--- /dev/null
+++ b/offload/liboffload/API/Kernel.td
@@ -0,0 +1,44 @@
+def : Function {
+    let name = "olCreateKernel";
+    let desc = "Create a kernel from the named entry point in the given program";
+    let details = [];
+    let params = [
+        Param<"ol_program_handle_t", "Program", "handle of the program", PARAM_IN>,
+        Param<"const char*", "KernelName", "name of the kernel entry point in the program", PARAM_IN>,
+        Param<"ol_kernel_handle_t*", "Kernel", "output pointer for the created kernel", PARAM_OUT>
+    ];
+    let returns = [];
+}
+
+def : Function {
+    let name = "olRetainKernel";
+    let desc = "Increment the reference count of the given kernel";
+    let details = [];
+    let params = [
+        Param<"ol_kernel_handle_t", "Kernel", "handle of the kernel", PARAM_IN>
+    ];
+    let returns = [];
+}
+
+def : Function {
+    let name = "olReleaseKernel";
+    let desc = "Decrement the reference count of the given kernel";
+    let details = [];
+    let params = [
+        Param<"ol_kernel_handle_t", "Kernel", "handle of the kernel", PARAM_IN>
+    ];
+    let returns = [];
+}
+
+def : Function {
+    let name = "olSetKernelArgValue";
+    let desc = "Set the value of a single kernel argument";
+    let details = [];
+    let params = [
+        Param<"ol_kernel_handle_t", "Kernel", "handle of the kernel", PARAM_IN>,
+        Param<"uint32_t", "Index", "index of the argument", PARAM_IN>,
+        Param<"size_t", "Size", "size of the argument data", PARAM_IN>,
+        Param<"void*", "ArgData", "pointer to the argument data", PARAM_IN>
+    ];
+    let returns = [];
+}
diff --git a/offload/liboffload/API/OffloadAPI.td b/offload/liboffload/API/OffloadAPI.td
index a609cc7ac80b4..f2822b93e6bf8 100644
--- a/offload/liboffload/API/OffloadAPI.td
+++ b/offload/liboffload/API/OffloadAPI.td
@@ -14,3 +14,8 @@ include "Common.td"
 include "Platform.td"
 include "Device.td"
 include "Memory.td"
+include "Queue.td"
+include "Event.td"
+include "Enqueue.td"
+include "Program.td"
+include "Kernel.td"
diff --git a/offload/liboffload/API/Program.td b/offload/liboffload/API/Program.td
new file mode 100644
index 0000000000000..684a6581320f8
--- /dev/null
+++ b/offload/liboffload/API/Program.td
@@ -0,0 +1,44 @@
+//===-- Program.td - Program definitions for Offload -------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains Offload API definitions related to the program handle
+//
+//===----------------------------------------------------------------------===//
+
+def : Function {
+    let name = "olCreateProgram";
+    let desc = "Create a program for the given device from program binary data";
+    let details = [];
+    let params = [
+        Param<"ol_device_handle_t", "Device", "handle of the device", PARAM_IN>,
+        Param<"void*", "ProgData", "pointer to the program binary data", PARAM_IN>,
+        Param<"size_t", "ProgDataSize", "size of the program binary in bytes", PARAM_IN>,
+        Param<"ol_program_handle_t*", "Queue", "output pointer for the created program", PARAM_OUT>
+    ];
+    let returns = [];
+}
+
+def : Function {
+    let name = "olRetainProgram";
+    let desc = "Increment the reference count of the given program";
+    let details = [];
+    let params = [
+        Param<"ol_program_handle_t", "Program", "handle of the program", PARAM_IN>
+    ];
+    let returns = [];
+}
+
+def : Function {
+    let name = "olReleaseProgram";
+    let desc = "Decrement the reference count of the given program";
+    let details = [];
+    let params = [
+        Param<"ol_program_handle_t", "Program", "handle of the program", PARAM_IN>
+    ];
+    let returns = [];
+}
diff --git a/offload/liboffload/API/Queue.td b/offload/liboffload/API/Queue.td
new file mode 100644
index 0000000000000..5629fa40d56d5
--- /dev/null
+++ b/offload/liboffload/API/Queue.td
@@ -0,0 +1,52 @@
+//===-- Queue.td - Queue definitions for Offload -----------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains Offload API definitions related to the queue handle
+//
+//===----------------------------------------------------------------------===//
+
+def : Function {
+    let name = "olCreateQueue";
+    let desc = "Create a queue for the given device";
+    let details = [];
+    let params = [
+        Param<"ol_device_handle_t", "Device", "handle of the device", PARAM_IN>,
+        Param<"ol_queue_handle_t*", "Queue", "output pointer for the created queue", PARAM_OUT>
+    ];
+    let returns = [];
+}
+
+def : Function {
+    let name = "olRetainQueue";
+    let desc = "Increment the reference count of the given queue";
+    let details = [];
+    let params = [
+        Param<"ol_queue_handle_t", "Queue", "handle of the queue", PARAM_IN>
+    ];
+    let returns = [];
+}
+
+def : Function {
+    let name = "olReleaseQueue";
+    let desc = "Decrement the reference count of the given queue";
+    let details = [];
+    let params = [
+        Param<"ol_queue_handle_t", "Queue", "handle of the queue", PARAM_IN>
+    ];
+    let returns = [];
+}
+
+def : Function {
+    let name = "olFinishQueue";
+    let desc = "Wait for the enqueued work on a queue to complete";
+    let details = [];
+    let params = [
+        Param<"ol_queue_handle_t", "Queue", "handle of the queue", PARAM_IN>
+    ];
+    let returns = [];
+}
diff --git a/offload/liboffload/include/generated/OffloadAPI.h b/offload/liboffload/include/generated/OffloadAPI.h
index 4c3356645e55a..2384de19ae72e 100644
--- a/offload/liboffload/include/generated/OffloadAPI.h
+++ b/offload/liboffload/include/generated/OffloadAPI.h
@@ -85,6 +85,22 @@ typedef struct ol_device_handle_t_ *ol_device_handle_t;
 /// @brief Handle of context object
 typedef struct ol_context_handle_t_ *ol_context_handle_t;
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Handle of queue object
+typedef struct ol_queue_handle_t_ *ol_queue_handle_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Handle of event object
+typedef struct ol_event_handle_t_ *ol_event_handle_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Handle of program object
+typedef struct ol_program_handle_t_ *ol_program_handle_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Handle of kernel object
+typedef struct ol_kernel_handle_t_ *ol_kernel_handle_t;
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Defines Return/Error codes
 typedef enum ol_errc_t {
@@ -523,6 +539,359 @@ OL_APIEXPORT ol_result_t OL_APICALL olMemFree(
     // [in] address of the allocation to free
     void *Address);
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Create a queue for the given device
+///
+/// @details
+///
+/// @returns
+///     - ::OL_RESULT_SUCCESS
+///     - ::OL_ERRC_UNINITIALIZED
+///     - ::OL_ERRC_DEVICE_LOST
+///     - ::OL_ERRC_INVALID_NULL_HANDLE
+///         + `NULL == Device`
+///     - ::OL_ERRC_INVALID_NULL_POINTER
+///         + `NULL == Queue`
+OL_APIEXPORT ol_result_t OL_APICALL olCreateQueue(
+    // [in] handle of the device
+    ol_device_handle_t Device,
+    // [out] output pointer for the created queue
+    ol_queue_handle_t *Queue);
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Increment the reference count of the given queue
+///
+/// @details
+///
+/// @returns
+///     - ::OL_RESULT_SUCCESS
+///     - ::OL_ERRC_UNINITIALIZED
+///     - ::OL_ERRC_DEVICE_LOST
+///     - ::OL_ERRC_INVALID_NULL_HANDLE
+///         + `NULL == Queue`
+///     - ::OL_ERRC_INVALID_NULL_POINTER
+OL_APIEXPORT ol_result_t OL_APICALL olRetainQueue(
+    // [in] handle of the queue
+    ol_queue_handle_t Queue);
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Decrement the reference count of the given queue
+///
+/// @details
+///
+/// @returns
+///     - ::OL_RESULT_SUCCESS
+///     - ::OL_ERRC_UNINITIALIZED
+///     - ::OL_ERRC_DEVICE_LOST
+///     - ::OL_ERRC_INVALID_NULL_HANDLE
+///         + `NULL == Queue`
+///     - ::OL_ERRC_INVALID_NULL_POINTER
+OL_APIEXPORT ol_result_t OL_APICALL olReleaseQueue(
+    // [in] handle of the queue
+    ol_queue_handle_t Queue);
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Wait for the enqueued work on a queue to complete
+///
+/// @details
+///
+/// @returns
+///     - ::OL_RESULT_SUCCESS
+///     - ::OL_ERRC_UNINITIALIZED
+///     - ::OL_ERRC_DEVICE_LOST
+///     - ::OL_ERRC_INVALID_NULL_HANDLE
+///         + `NULL == Queue`
+///     - ::OL_ERRC_INVALID_NULL_POINTER
+OL_APIEXPORT ol_result_t OL_APICALL olFinishQueue(
+    // [in] handle of the queue
+    ol_queue_handle_t Queue);
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Increment the reference count of the given event
+///
+/// @details
+///
+/// @returns
+///     - ::OL_RESULT_SUCCESS
+///     - ::OL_ERRC_UNINITIALIZED
+///     - ::OL_ERRC_DEVICE_LOST
+///     - ::OL_ERRC_INVALID_NULL_HANDLE
+///         + `NULL == Event`
+///     - ::OL_ERRC_INVALID_NULL_POINTER
+OL_APIEXPORT ol_result_t OL_APICALL olRetainEvent(
+    // [in] handle of the event
+    ol_event_handle_t Event);
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Decrement the reference count of the given event
+///
+/// @details
+///
+/// @returns
+///     - ::OL_RESULT_SUCCESS
+///     - ::OL_ERRC_UNINITIALIZED
+///     - ::OL_ERRC_DEVICE_LOST
+///     - ::OL_ERRC_INVALID_NULL_HANDLE
+///         + `NULL == Event`
+///     - ::OL_ERRC_INVALID_NULL_POINTER
+OL_APIEXPORT ol_result_t OL_APICALL olReleaseEvent(
+    // [in] handle of the event
+    ol_event_handle_t Event);
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Wait for the event to be complete
+///
+/// @details
+///
+/// @returns
+///     - ::OL_RESULT_SUCCESS
+///     - ::OL_ERRC_UNINITIALIZED
+///     - ::OL_ERRC_DEVICE_LOST
+///     - ::OL_ERRC_INVALID_NULL_HANDLE
+///         + `NULL == Event`
+///     - ::OL_ERRC_INVALID_NULL_POINTER
+OL_APIEXPORT ol_result_t OL_APICALL olWaitEvent(
+    // [in] handle of the event
+    ol_event_handle_t Event);
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Enqueue a write operation from host to device memory
+///
+/// @details
+///
+/// @returns
+///     - ::OL_RESULT_SUCCESS
+///     - ::OL_ERRC_UNINITIALIZED
+///     - ::OL_ERRC_DEVICE_LOST
+///     - ::OL_ERRC_INVALID_NULL_HANDLE
+///         + `NULL == Queue`
+///     - ::OL_ERRC_INVALID_NULL_POINTER
+///         + `NULL == SrcPtr`
+///         + `NULL == DstPtr`
+OL_APIEXPORT ol_result_t OL_APICALL olEnqueueDataWrite(
+    // [in] handle of the queue
+    ol_queue_handle_t Queue,
+    // [in] host pointer to copy from
+    void *SrcPtr,
+    // [in] device pointer to copy to
+    void *DstPtr,
+    // [in] size in bytes of data to copy
+    size_t Size,
+    // [out][optional] optional recorded event for the enqueued operation
+    ol_event_handle_t *EventOut);
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Enqueue a read operation from device to host memory
+///
+/// @details
+///
+/// @returns
+///     - ::OL_RESULT_SUCCESS
+///     - ::OL_ERRC_UNINITIALIZED
+///     - ::OL_ERRC_DEVICE_LOST
+///     - ::OL_ERRC_INVALID_NULL_HANDLE
+///         + `NULL == Queue`
+///     - ::OL_ERRC_INVALID_NULL_POINTER
+///         + `NULL == SrcPtr`
+///         + `NULL == DstPtr`
+OL_APIEXPORT ol_result_t OL_APICALL olEnqueueDataRead(
+    // [in] handle of the queue
+    ol_queue_handle_t Queue,
+    // [in] device pointer to copy from
+    void *SrcPtr,
+    // [in] host pointer to copy to
+    void *DstPtr,
+    // [in] size in bytes of data to copy
+    size_t Size,
+    // [out][optional] optional recorded event for the enqueued operation
+    ol_event_handle_t *EventOut);
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Enqueue a write operation between device allocations
+///
+/// @details
+///
+/// @returns
+///     - ::OL_RESULT_SUCCESS
+///     - ::OL_ERRC_UNINITIALIZED
+///     - ::OL_ERRC_DEVICE_LOST
+///     - ::OL_ERRC_INVALID_NULL_HANDLE
+///         + `NULL == Queue`
+///         + `NULL == DstDevice`
+///     - ::OL_ERRC_INVALID_NULL_POINTER
+///         + `NULL == SrcPtr`
+///         + `NULL == DstPtr`
+OL_APIEXPORT ol_result_t OL_APICALL olEnqueueDataCopy(
+    // [in] handle of the queue
+    ol_queue_handle_t Queue,
+    // [in] device pointer to copy from
+    void *SrcPtr,
+    // [in] device pointer to copy to
+    void *DstPtr,
+    // [in] device that the destination pointer is resident on
+    ol_device_handle_t DstDevice,
+    // [in] size in bytes of data to copy
+    size_t Size,
+    // [out][optional] optional recorded event for the enqueued operation
+    ol_event_handle_t *EventOut);
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Enqueue a kernel launch with the specified size and parameters
+///
+/// @details
+///
+/// @returns
+///     - ::OL_RESULT_SUCCESS
+///     - ::OL_ERRC_UNINITIALIZED
+///     - ::OL_ERRC_DEVICE_LOST
+///     - ::OL_ERRC_INVALID_NULL_HANDLE
+///         + `NULL == Queue`
+///         + `NULL == Kernel`
+///     - ::OL_ERRC_INVALID_NULL_POINTER
+///         + `NULL == GlobalWorkSize`
+OL_APIEXPORT ol_result_t OL_APICALL olEnqueueKernelLaunch(
+    // [in] handle of the queue
+    ol_queue_handle_t Queue,
+    // [in] handle of the kernel
+    ol_kernel_handle_t Kernel,
+    // [in] an array of size 3 representing the global work size
+    const size_t *GlobalWorkSize,
+    // [out][optional] optional recorded event for the enqueued operation
+    ol_event_handle_t *EventOut);
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Create a program for the given device from binary data
+///
+/// @details
+///
+/// @returns
+///     - ::OL_RESULT_SUCCESS
+///     - ::OL_ERRC_UNINITIALIZED
+///     - ::OL_ERRC_DEVICE_LOST
+///     - ::OL_ERRC_INVALID_NULL_HANDLE
+///         + `NULL == Device`
+///     - ::OL_ERRC_INVALID_NULL_POINTER
+///         + `NULL == ProgData`
+///         + `NULL == Queue`
+OL_APIEXPORT ol_result_t OL_APICALL olCreateProgram(
+    // [in] handle of the device
+    ol_device_handle_t Device,
+    // [in] pointer to the program binary data
+    void *ProgData,
+    // [in] size of the program binary in bytes
+    size_t ProgDataSize,
+    // [out] output pointer for the created program
+    ol_program_handle_t *Queue);
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Increment the reference count of the given program
+///
+/// @details
+///
+/// @returns
+///     - ::OL_RESULT_SUCCESS
+///     - ::OL_ERRC_UNINITIALIZED
+///     - ::OL_ERRC_DEVICE_LOST
+///     - ::OL_ERRC_INVALID_NULL_HANDLE
+///         + `NULL == Program`
+///     - ::OL_ERRC_INVALID_NULL_POINTER
+OL_APIEXPORT ol_result_t OL_APICALL olRetainProgram(
+    // [in] handle of the program
+    ol_program_handle_t Program);
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Decrement the reference count of the given program
+///
+/// @details
+///
+/// @returns
+///     - ::OL_RESULT_SUCCESS
+///     - ::OL_ERRC_UNINITIALIZED
+///     - ::OL_ERRC_DEVICE_LOST
+///     - ::OL_ERRC_INVALID_NULL_HANDLE
+///         + `NULL == Program`
+///     - ::OL_ERRC_INVALID_NULL_POINTER
+OL_APIEXPORT ol_result_t OL_APICALL olReleaseProgram(
+    // [in] handle of the program
+    ol_program_handle_t Program);
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Create a kernel from the named entry point of a program
+///
+/// @details
+///
+/// @returns
+///     - ::OL_RESULT_SUCCESS
+///     - ::OL_ERRC_UNINITIALIZED
+///     - ::OL_ERRC_DEVICE_LOST
+///     - ::OL_ERRC_INVALID_NULL_HANDLE
+///         + `NULL == Program`
+///     - ::OL_ERRC_INVALID_NULL_POINTER
+///         + `NULL == KernelName`
+///         + `NULL == Kernel`
+OL_APIEXPORT ol_result_t OL_APICALL olCreateKernel(
+    // [in] handle of the program
+    ol_program_handle_t Program,
+    // [in] name of the kernel entry point in the program
+    const char *KernelName,
+    // [out] output pointer for the created kernel
+    ol_kernel_handle_t *Kernel);
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Increment the reference count of the given kernel
+///
+/// @details
+///
+/// @returns
+///     - ::OL_RESULT_SUCCESS
+///     - ::OL_ERRC_UNINITIALIZED
+///     - ::OL_ERRC_DEVICE_LOST
+///     - ::OL_ERRC_INVALID_NULL_HANDLE
+///         + `NULL == Kernel`
+///     - ::OL_ERRC_INVALID_NULL_POINTER
+OL_APIEXPORT ol_result_t OL_APICALL olRetainKernel(
+    // [in] handle of the kernel
+    ol_kernel_handle_t Kernel);
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Decrement the reference count of the given kernel
+///
+/// @details
+///
+/// @returns
+///     - ::OL_RESULT_SUCCESS
+///     - ::OL_ERRC_UNINITIALIZED
+///     - ::OL_ERRC_DEVICE_LOST
+///     - ::OL_ERRC_INVALID_NULL_HANDLE
+///         + `NULL == Kernel`
+///     - ::OL_ERRC_INVALID_NULL_POINTER
+OL_APIEXPORT ol_result_t OL_APICALL olReleaseKernel(
+    // [in] handle of the kernel
+    ol_kernel_handle_t Kernel);
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Set the value of a single kernel argument
+///
+/// @details
+///
+/// @returns
+///     - ::OL_RESULT_SUCCESS
+///     - ::OL_ERRC_UNINITIALIZED
+///     - ::OL_ERRC_DEVICE_LOST
+///     - ::OL_ERRC_INVALID_NULL_HANDLE
+///         + `NULL == Kernel`
+///     - ::OL_ERRC_INVALID_NULL_POINTER
+///         + `NULL == ArgData`
+OL_APIEXPORT ol_result_t OL_APICALL olSetKernelArgValue(
+    // [in] handle of the kernel
+    ol_kernel_handle_t Kernel,
+    // [in] index of the argument
+    uint32_t Index,
+    // [in] size of the argument data
+    size_t Size,
+    // [in] pointer to the argument data
+    void *ArgData);
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Function parameters for olGetPlatform
 /// @details Each entry is a pointer to the parameter passed to the function;
@@ -613,6 +982,157 @@ typedef struct ol_mem_free_params_t {
   void **pAddress;
 } ol_mem_free_params_t;
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Function parameters for olCreateQueue
+/// @details Each entry is a pointer to the parameter passed to the function;
+typedef struct ol_create_queue_params_t {
+  ol_device_handle_t *pDevice;
+  ol_queue_handle_t **pQueue;
+} ol_create_queue_params_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Function parameters for olRetainQueue
+/// @details Each entry is a pointer to the parameter passed to the function;
+typedef struct ol_retain_queue_params_t {
+  ol_queue_handle_t *pQueue;
+} ol_retain_queue_params_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Function parameters for olReleaseQueue
+/// @details Each entry is a pointer to the parameter passed to the function;
+typedef struct ol_release_queue_params_t {
+  ol_queue_handle_t *pQueue;
+} ol_release_queue_params_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Function parameters for olFinishQueue
+/// @details Each entry is a pointer to the parameter passed to the function;
+typedef struct ol_finish_queue_params_t {
+  ol_queue_handle_t *pQueue;
+} ol_finish_queue_params_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Function parameters for olRetainEvent
+/// @details Each entry is a pointer to the parameter passed to the function;
+typedef struct ol_retain_event_params_t {
+  ol_event_handle_t *pEvent;
+} ol_retain_event_params_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Function parameters for olReleaseEvent
+/// @details Each entry is a pointer to the parameter passed to the function;
+typedef struct ol_release_event_params_t {
+  ol_event_handle_t *pEvent;
+} ol_release_event_params_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Function parameters for olWaitEvent
+/// @details Each entry is a pointer to the parameter passed to the function;
+typedef struct ol_wait_event_params_t {
+  ol_event_handle_t *pEvent;
+} ol_wait_event_params_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Function parameters for olEnqueueDataWrite
+/// @details Each entry is a pointer to the parameter passed to the function;
+typedef struct ol_enqueue_data_write_params_t {
+  ol_queue_handle_t *pQueue;
+  void **pSrcPtr;
+  void **pDstPtr;
+  size_t *pSize;
+  ol_event_handle_t **pEventOut;
+} ol_enqueue_data_write_params_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Function parameters for olEnqueueDataRead
+/// @details Each entry is a pointer to the parameter passed to the function;
+typedef struct ol_enqueue_data_read_params_t {
+  ol_queue_handle_t *pQueue;
+  void **pSrcPtr;
+  void **pDstPtr;
+  size_t *pSize;
+  ol_event_handle_t **pEventOut;
+} ol_enqueue_data_read_params_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Function parameters for olEnqueueDataCopy
+/// @details Each entry is a pointer to the parameter passed to the function;
+typedef struct ol_enqueue_data_copy_params_t {
+  ol_queue_handle_t *pQueue;
+  void **pSrcPtr;
+  void **pDstPtr;
+  ol_device_handle_t *pDstDevice;
+  size_t *pSize;
+  ol_event_handle_t **pEventOut;
+} ol_enqueue_data_copy_params_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Function parameters for olEnqueueKernelLaunch
+/// @details Each entry is a pointer to the parameter passed to the function;
+typedef struct ol_enqueue_kernel_launch_params_t {
+  ol_queue_handle_t *pQueue;
+  ol_kernel_handle_t *pKernel;
+  const size_t **pGlobalWorkSize;
+  ol_event_handle_t **pEventOut;
+} ol_enqueue_kernel_launch_params_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Function parameters for olCreateProgram
+/// @details Each entry is a pointer to the parameter passed to the function;
+typedef struct ol_create_program_params_t {
+  ol_device_handle_t *pDevice;
+  void **pProgData;
+  size_t *pProgDataSize;
+  ol_program_handle_t **pQueue;
+} ol_create_program_params_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Function parameters for olRetainProgram
+/// @details Each entry is a pointer to the parameter passed to the function;
+typedef struct ol_retain_program_params_t {
+  ol_program_handle_t *pProgram;
+} ol_retain_program_params_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Function parameters for olReleaseProgram
+/// @details Each entry is a pointer to the parameter passed to the function;
+typedef struct ol_release_program_params_t {
+  ol_program_handle_t *pProgram;
+} ol_release_program_params_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Function parameters for olCreateKernel
+/// @details Each entry is a pointer to the parameter passed to the function;
+typedef struct ol_create_kernel_params_t {
+  ol_program_handle_t *pProgram;
+  const char **pKernelName;
+  ol_kernel_handle_t **pKernel;
+} ol_create_kernel_params_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Function parameters for olRetainKernel
+/// @details Each entry is a pointer to the parameter passed to the function;
+typedef struct ol_retain_kernel_params_t {
+  ol_kernel_handle_t *pKernel;
+} ol_retain_kernel_params_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Function parameters for olReleaseKernel
+/// @details Each entry is a pointer to the parameter passed to the function;
+typedef struct ol_release_kernel_params_t {
+  ol_kernel_handle_t *pKernel;
+} ol_release_kernel_params_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Function parameters for olSetKernelArgValue
+/// @details Each entry is a pointer to the parameter passed to the function;
+typedef struct ol_set_kernel_arg_value_params_t {
+  ol_kernel_handle_t *pKernel;
+  uint32_t *pIndex;
+  size_t *pSize;
+  void **pArgData;
+} ol_set_kernel_arg_value_params_t;
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Variant of olInit that also sets source code location information
 /// @details See also ::olInit
@@ -702,6 +1222,142 @@ OL_APIEXPORT ol_result_t OL_APICALL
 olMemFreeWithCodeLoc(ol_device_handle_t Device, ol_alloc_type_t Type,
                      void *Address, ol_code_location_t *CodeLocation);
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Variant of olCreateQueue that also sets source code location
+/// information
+/// @details See also ::olCreateQueue
+OL_APIEXPORT ol_result_t OL_APICALL
+olCreateQueueWithCodeLoc(ol_device_handle_t Device, ol_queue_handle_t *Queue,
+                         ol_code_location_t *CodeLocation);
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Variant of olRetainQueue that also sets source code location
+/// information
+/// @details See also ::olRetainQueue
+OL_APIEXPORT ol_result_t OL_APICALL olRetainQueueWithCodeLoc(
+    ol_queue_handle_t Queue, ol_code_location_t *CodeLocation);
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Variant of olReleaseQueue that also sets source code location
+/// information
+/// @details See also ::olReleaseQueue
+OL_APIEXPORT ol_result_t OL_APICALL olReleaseQueueWithCodeLoc(
+    ol_queue_handle_t Queue, ol_code_location_t *CodeLocation);
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Variant of olFinishQueue that also sets source code location
+/// information
+/// @details See also ::olFinishQueue
+OL_APIEXPORT ol_result_t OL_APICALL olFinishQueueWithCodeLoc(
+    ol_queue_handle_t Queue, ol_code_location_t *CodeLocation);
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Variant of olRetainEvent that also sets source code location
+/// information
+/// @details See also ::olRetainEvent
+OL_APIEXPORT ol_result_t OL_APICALL olRetainEventWithCodeLoc(
+    ol_event_handle_t Event, ol_code_location_t *CodeLocation);
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Variant of olReleaseEvent that also sets source code location
+/// information
+/// @details See also ::olReleaseEvent
+OL_APIEXPORT ol_result_t OL_APICALL olReleaseEventWithCodeLoc(
+    ol_event_handle_t Event, ol_code_location_t *CodeLocation);
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Variant of olWaitEvent that also sets source code location
+/// information
+/// @details See also ::olWaitEvent
+OL_APIEXPORT ol_result_t OL_APICALL olWaitEventWithCodeLoc(
+    ol_event_handle_t Event, ol_code_location_t *CodeLocation);
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Variant of olEnqueueDataWrite that also sets source code location
+/// information
+/// @details See also ::olEnqueueDataWrite
+OL_APIEXPORT ol_result_t OL_APICALL olEnqueueDataWriteWithCodeLoc(
+    ol_queue_handle_t Queue, void *SrcPtr, void *DstPtr, size_t Size,
+    ol_event_handle_t *EventOut, ol_code_location_t *CodeLocation);
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Variant of olEnqueueDataRead that also sets source code location
+/// information
+/// @details See also ::olEnqueueDataRead
+OL_APIEXPORT ol_result_t OL_APICALL olEnqueueDataReadWithCodeLoc(
+    ol_queue_handle_t Queue, void *SrcPtr, void *DstPtr, size_t Size,
+    ol_event_handle_t *EventOut, ol_code_location_t *CodeLocation);
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Variant of olEnqueueDataCopy that also sets source code location
+/// information
+/// @details See also ::olEnqueueDataCopy
+OL_APIEXPORT ol_result_t OL_APICALL olEnqueueDataCopyWithCodeLoc(
+    ol_queue_handle_t Queue, void *SrcPtr, void *DstPtr,
+    ol_device_handle_t DstDevice, size_t Size, ol_event_handle_t *EventOut,
+    ol_code_location_t *CodeLocation);
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Variant of olEnqueueKernelLaunch that also sets source code location
+/// information
+/// @details See also ::olEnqueueKernelLaunch
+OL_APIEXPORT ol_result_t OL_APICALL olEnqueueKernelLaunchWithCodeLoc(
+    ol_queue_handle_t Queue, ol_kernel_handle_t Kernel,
+    const size_t *GlobalWorkSize, ol_event_handle_t *EventOut,
+    ol_code_location_t *CodeLocation);
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Variant of olCreateProgram that also sets source code location
+/// information
+/// @details See also ::olCreateProgram
+OL_APIEXPORT ol_result_t OL_APICALL olCreateProgramWithCodeLoc(
+    ol_device_handle_t Device, void *ProgData, size_t ProgDataSize,
+    ol_program_handle_t *Queue, ol_code_location_t *CodeLocation);
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Variant of olRetainProgram that also sets source code location
+/// information
+/// @details See also ::olRetainProgram
+OL_APIEXPORT ol_result_t OL_APICALL olRetainProgramWithCodeLoc(
+    ol_program_handle_t Program, ol_code_location_t *CodeLocation);
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Variant of olReleaseProgram that also sets source code location
+/// information
+/// @details See also ::olReleaseProgram
+OL_APIEXPORT ol_result_t OL_APICALL olReleaseProgramWithCodeLoc(
+    ol_program_handle_t Program, ol_code_location_t *CodeLocation);
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Variant of olCreateKernel that also sets source code location
+/// information
+/// @details See also ::olCreateKernel
+OL_APIEXPORT ol_result_t OL_APICALL olCreateKernelWithCodeLoc(
+    ol_program_handle_t Program, const char *KernelName,
+    ol_kernel_handle_t *Kernel, ol_code_location_t *CodeLocation);
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Variant of olRetainKernel that also sets source code location
+/// information
+/// @details See also ::olRetainKernel
+OL_APIEXPORT ol_result_t OL_APICALL olRetainKernelWithCodeLoc(
+    ol_kernel_handle_t Kernel, ol_code_location_t *CodeLocation);
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Variant of olReleaseKernel that also sets source code location
+/// information
+/// @details See also ::olReleaseKernel
+OL_APIEXPORT ol_result_t OL_APICALL olReleaseKernelWithCodeLoc(
+    ol_kernel_handle_t Kernel, ol_code_location_t *CodeLocation);
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Variant of olSetKernelArgValue that also sets source code location
+/// information
+/// @details See also ::olSetKernelArgValue
+OL_APIEXPORT ol_result_t OL_APICALL olSetKernelArgValueWithCodeLoc(
+    ol_kernel_handle_t Kernel, uint32_t Index, size_t Size, void *ArgData,
+    ol_code_location_t *CodeLocation);
+
 #if defined(__cplusplus)
 } // extern "C"
 #endif
diff --git a/offload/liboffload/include/generated/OffloadEntryPoints.inc b/offload/liboffload/include/generated/OffloadEntryPoints.inc
index bcde65452b265..0ae3c36f95827 100644
--- a/offload/liboffload/include/generated/OffloadEntryPoints.inc
+++ b/offload/liboffload/include/generated/OffloadEntryPoints.inc
@@ -536,3 +536,778 @@ ol_result_t olMemFreeWithCodeLoc(ol_device_handle_t Device,
   currentCodeLocation() = nullptr;
   return Result;
 }
+
+///////////////////////////////////////////////////////////////////////////////
+ol_impl_result_t olCreateQueue_val(ol_device_handle_t Device,
+                                   ol_queue_handle_t *Queue) {
+  if (true /*enableParameterValidation*/) {
+    if (NULL == Device) {
+      return OL_ERRC_INVALID_NULL_HANDLE;
+    }
+
+    if (NULL == Queue) {
+      return OL_ERRC_INVALID_NULL_POINTER;
+    }
+  }
+
+  return olCreateQueue_impl(Device, Queue);
+}
+OL_APIEXPORT ol_result_t OL_APICALL olCreateQueue(ol_device_handle_t Device,
+                                                  ol_queue_handle_t *Queue) {
+  if (offloadConfig().TracingEnabled) {
+    std::cout << "---> olCreateQueue";
+  }
+
+  ol_result_t Result = olCreateQueue_val(Device, Queue);
+
+  if (offloadConfig().TracingEnabled) {
+    ol_create_queue_params_t Params = {&Device, &Queue};
+    std::cout << "(" << &Params << ")";
+    std::cout << "-> " << Result << "\n";
+    if (Result && Result->Details) {
+      std::cout << "     *Error Details* " << Result->Details << " \n";
+    }
+  }
+  return Result;
+}
+ol_result_t olCreateQueueWithCodeLoc(ol_device_handle_t Device,
+                                     ol_queue_handle_t *Queue,
+                                     ol_code_location_t *CodeLocation) {
+  currentCodeLocation() = CodeLocation;
+  ol_result_t Result = olCreateQueue(Device, Queue);
+
+  currentCodeLocation() = nullptr;
+  return Result;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+ol_impl_result_t olRetainQueue_val(ol_queue_handle_t Queue) {
+  if (true /*enableParameterValidation*/) {
+    if (NULL == Queue) {
+      return OL_ERRC_INVALID_NULL_HANDLE;
+    }
+  }
+
+  return olRetainQueue_impl(Queue);
+}
+OL_APIEXPORT ol_result_t OL_APICALL olRetainQueue(ol_queue_handle_t Queue) {
+  if (offloadConfig().TracingEnabled) {
+    std::cout << "---> olRetainQueue";
+  }
+
+  ol_result_t Result = olRetainQueue_val(Queue);
+
+  if (offloadConfig().TracingEnabled) {
+    ol_retain_queue_params_t Params = {&Queue};
+    std::cout << "(" << &Params << ")";
+    std::cout << "-> " << Result << "\n";
+    if (Result && Result->Details) {
+      std::cout << "     *Error Details* " << Result->Details << " \n";
+    }
+  }
+  return Result;
+}
+ol_result_t olRetainQueueWithCodeLoc(ol_queue_handle_t Queue,
+                                     ol_code_location_t *CodeLocation) {
+  currentCodeLocation() = CodeLocation;
+  ol_result_t Result = olRetainQueue(Queue);
+
+  currentCodeLocation() = nullptr;
+  return Result;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+ol_impl_result_t olReleaseQueue_val(ol_queue_handle_t Queue) {
+  if (true /*enableParameterValidation*/) {
+    if (NULL == Queue) {
+      return OL_ERRC_INVALID_NULL_HANDLE;
+    }
+  }
+
+  return olReleaseQueue_impl(Queue);
+}
+OL_APIEXPORT ol_result_t OL_APICALL olReleaseQueue(ol_queue_handle_t Queue) {
+  if (offloadConfig().TracingEnabled) {
+    std::cout << "---> olReleaseQueue";
+  }
+
+  ol_result_t Result = olReleaseQueue_val(Queue);
+
+  if (offloadConfig().TracingEnabled) {
+    ol_release_queue_params_t Params = {&Queue};
+    std::cout << "(" << &Params << ")";
+    std::cout << "-> " << Result << "\n";
+    if (Result && Result->Details) {
+      std::cout << "     *Error Details* " << Result->Details << " \n";
+    }
+  }
+  return Result;
+}
+ol_result_t olReleaseQueueWithCodeLoc(ol_queue_handle_t Queue,
+                                      ol_code_location_t *CodeLocation) {
+  currentCodeLocation() = CodeLocation;
+  ol_result_t Result = olReleaseQueue(Queue);
+
+  currentCodeLocation() = nullptr;
+  return Result;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+ol_impl_result_t olFinishQueue_val(ol_queue_handle_t Queue) {
+  if (true /*enableParameterValidation*/) {
+    if (NULL == Queue) {
+      return OL_ERRC_INVALID_NULL_HANDLE;
+    }
+  }
+
+  return olFinishQueue_impl(Queue);
+}
+OL_APIEXPORT ol_result_t OL_APICALL olFinishQueue(ol_queue_handle_t Queue) {
+  if (offloadConfig().TracingEnabled) {
+    std::cout << "---> olFinishQueue";
+  }
+
+  ol_result_t Result = olFinishQueue_val(Queue);
+
+  if (offloadConfig().TracingEnabled) {
+    ol_finish_queue_params_t Params = {&Queue};
+    std::cout << "(" << &Params << ")";
+    std::cout << "-> " << Result << "\n";
+    if (Result && Result->Details) {
+      std::cout << "     *Error Details* " << Result->Details << " \n";
+    }
+  }
+  return Result;
+}
+ol_result_t olFinishQueueWithCodeLoc(ol_queue_handle_t Queue,
+                                     ol_code_location_t *CodeLocation) {
+  currentCodeLocation() = CodeLocation;
+  ol_result_t Result = olFinishQueue(Queue);
+
+  currentCodeLocation() = nullptr;
+  return Result;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+ol_impl_result_t olRetainEvent_val(ol_event_handle_t Event) {
+  if (true /*enableParameterValidation*/) {
+    if (NULL == Event) {
+      return OL_ERRC_INVALID_NULL_HANDLE;
+    }
+  }
+
+  return olRetainEvent_impl(Event);
+}
+OL_APIEXPORT ol_result_t OL_APICALL olRetainEvent(ol_event_handle_t Event) {
+  if (offloadConfig().TracingEnabled) {
+    std::cout << "---> olRetainEvent";
+  }
+
+  ol_result_t Result = olRetainEvent_val(Event);
+
+  if (offloadConfig().TracingEnabled) {
+    ol_retain_event_params_t Params = {&Event};
+    std::cout << "(" << &Params << ")";
+    std::cout << "-> " << Result << "\n";
+    if (Result && Result->Details) {
+      std::cout << "     *Error Details* " << Result->Details << " \n";
+    }
+  }
+  return Result;
+}
+ol_result_t olRetainEventWithCodeLoc(ol_event_handle_t Event,
+                                     ol_code_location_t *CodeLocation) {
+  currentCodeLocation() = CodeLocation;
+  ol_result_t Result = olRetainEvent(Event);
+
+  currentCodeLocation() = nullptr;
+  return Result;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+ol_impl_result_t olReleaseEvent_val(ol_event_handle_t Event) {
+  if (true /*enableParameterValidation*/) {
+    if (NULL == Event) {
+      return OL_ERRC_INVALID_NULL_HANDLE;
+    }
+  }
+
+  return olReleaseEvent_impl(Event);
+}
+OL_APIEXPORT ol_result_t OL_APICALL olReleaseEvent(ol_event_handle_t Event) {
+  if (offloadConfig().TracingEnabled) {
+    std::cout << "---> olReleaseEvent";
+  }
+
+  ol_result_t Result = olReleaseEvent_val(Event);
+
+  if (offloadConfig().TracingEnabled) {
+    ol_release_event_params_t Params = {&Event};
+    std::cout << "(" << &Params << ")";
+    std::cout << "-> " << Result << "\n";
+    if (Result && Result->Details) {
+      std::cout << "     *Error Details* " << Result->Details << " \n";
+    }
+  }
+  return Result;
+}
+ol_result_t olReleaseEventWithCodeLoc(ol_event_handle_t Event,
+                                      ol_code_location_t *CodeLocation) {
+  currentCodeLocation() = CodeLocation;
+  ol_result_t Result = olReleaseEvent(Event);
+
+  currentCodeLocation() = nullptr;
+  return Result;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+ol_impl_result_t olWaitEvent_val(ol_event_handle_t Event) {
+  if (true /*enableParameterValidation*/) {
+    if (NULL == Event) {
+      return OL_ERRC_INVALID_NULL_HANDLE;
+    }
+  }
+
+  return olWaitEvent_impl(Event);
+}
+OL_APIEXPORT ol_result_t OL_APICALL olWaitEvent(ol_event_handle_t Event) {
+  if (offloadConfig().TracingEnabled) {
+    std::cout << "---> olWaitEvent";
+  }
+
+  ol_result_t Result = olWaitEvent_val(Event);
+
+  if (offloadConfig().TracingEnabled) {
+    ol_wait_event_params_t Params = {&Event};
+    std::cout << "(" << &Params << ")";
+    std::cout << "-> " << Result << "\n";
+    if (Result && Result->Details) {
+      std::cout << "     *Error Details* " << Result->Details << " \n";
+    }
+  }
+  return Result;
+}
+ol_result_t olWaitEventWithCodeLoc(ol_event_handle_t Event,
+                                   ol_code_location_t *CodeLocation) {
+  currentCodeLocation() = CodeLocation;
+  ol_result_t Result = olWaitEvent(Event);
+
+  currentCodeLocation() = nullptr;
+  return Result;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+ol_impl_result_t olEnqueueDataWrite_val(ol_queue_handle_t Queue, void *SrcPtr,
+                                        void *DstPtr, size_t Size,
+                                        ol_event_handle_t *EventOut) {
+  if (true /*enableParameterValidation*/) {
+    if (NULL == Queue) {
+      return OL_ERRC_INVALID_NULL_HANDLE;
+    }
+
+    if (NULL == SrcPtr) {
+      return OL_ERRC_INVALID_NULL_POINTER;
+    }
+
+    if (NULL == DstPtr) {
+      return OL_ERRC_INVALID_NULL_POINTER;
+    }
+  }
+
+  return olEnqueueDataWrite_impl(Queue, SrcPtr, DstPtr, Size, EventOut);
+}
+OL_APIEXPORT ol_result_t OL_APICALL
+olEnqueueDataWrite(ol_queue_handle_t Queue, void *SrcPtr, void *DstPtr,
+                   size_t Size, ol_event_handle_t *EventOut) {
+  if (offloadConfig().TracingEnabled) {
+    std::cout << "---> olEnqueueDataWrite";
+  }
+
+  ol_result_t Result =
+      olEnqueueDataWrite_val(Queue, SrcPtr, DstPtr, Size, EventOut);
+
+  if (offloadConfig().TracingEnabled) {
+    ol_enqueue_data_write_params_t Params = {&Queue, &SrcPtr, &DstPtr, &Size,
+                                             &EventOut};
+    std::cout << "(" << &Params << ")";
+    std::cout << "-> " << Result << "\n";
+    if (Result && Result->Details) {
+      std::cout << "     *Error Details* " << Result->Details << " \n";
+    }
+  }
+  return Result;
+}
+ol_result_t olEnqueueDataWriteWithCodeLoc(ol_queue_handle_t Queue, void *SrcPtr,
+                                          void *DstPtr, size_t Size,
+                                          ol_event_handle_t *EventOut,
+                                          ol_code_location_t *CodeLocation) {
+  currentCodeLocation() = CodeLocation;
+  ol_result_t Result =
+      olEnqueueDataWrite(Queue, SrcPtr, DstPtr, Size, EventOut);
+
+  currentCodeLocation() = nullptr;
+  return Result;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+ol_impl_result_t olEnqueueDataRead_val(ol_queue_handle_t Queue, void *SrcPtr,
+                                       void *DstPtr, size_t Size,
+                                       ol_event_handle_t *EventOut) {
+  if (true /*enableParameterValidation*/) {
+    if (NULL == Queue) {
+      return OL_ERRC_INVALID_NULL_HANDLE;
+    }
+
+    if (NULL == SrcPtr) {
+      return OL_ERRC_INVALID_NULL_POINTER;
+    }
+
+    if (NULL == DstPtr) {
+      return OL_ERRC_INVALID_NULL_POINTER;
+    }
+  }
+
+  return olEnqueueDataRead_impl(Queue, SrcPtr, DstPtr, Size, EventOut);
+}
+OL_APIEXPORT ol_result_t OL_APICALL
+olEnqueueDataRead(ol_queue_handle_t Queue, void *SrcPtr, void *DstPtr,
+                  size_t Size, ol_event_handle_t *EventOut) {
+  if (offloadConfig().TracingEnabled) {
+    std::cout << "---> olEnqueueDataRead";
+  }
+
+  ol_result_t Result =
+      olEnqueueDataRead_val(Queue, SrcPtr, DstPtr, Size, EventOut);
+
+  if (offloadConfig().TracingEnabled) {
+    ol_enqueue_data_read_params_t Params = {&Queue, &SrcPtr, &DstPtr, &Size,
+                                            &EventOut};
+    std::cout << "(" << &Params << ")";
+    std::cout << "-> " << Result << "\n";
+    if (Result && Result->Details) {
+      std::cout << "     *Error Details* " << Result->Details << " \n";
+    }
+  }
+  return Result;
+}
+ol_result_t olEnqueueDataReadWithCodeLoc(ol_queue_handle_t Queue, void *SrcPtr,
+                                         void *DstPtr, size_t Size,
+                                         ol_event_handle_t *EventOut,
+                                         ol_code_location_t *CodeLocation) {
+  currentCodeLocation() = CodeLocation;
+  ol_result_t Result = olEnqueueDataRead(Queue, SrcPtr, DstPtr, Size, EventOut);
+
+  currentCodeLocation() = nullptr;
+  return Result;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+ol_impl_result_t olEnqueueDataCopy_val(ol_queue_handle_t Queue, void *SrcPtr,
+                                       void *DstPtr,
+                                       ol_device_handle_t DstDevice,
+                                       size_t Size,
+                                       ol_event_handle_t *EventOut) {
+  if (true /*enableParameterValidation*/) {
+    if (NULL == Queue) {
+      return OL_ERRC_INVALID_NULL_HANDLE;
+    }
+
+    if (NULL == DstDevice) {
+      return OL_ERRC_INVALID_NULL_HANDLE;
+    }
+
+    if (NULL == SrcPtr) {
+      return OL_ERRC_INVALID_NULL_POINTER;
+    }
+
+    if (NULL == DstPtr) {
+      return OL_ERRC_INVALID_NULL_POINTER;
+    }
+  }
+
+  return olEnqueueDataCopy_impl(Queue, SrcPtr, DstPtr, DstDevice, Size,
+                                EventOut);
+}
+OL_APIEXPORT ol_result_t OL_APICALL olEnqueueDataCopy(
+    ol_queue_handle_t Queue, void *SrcPtr, void *DstPtr,
+    ol_device_handle_t DstDevice, size_t Size, ol_event_handle_t *EventOut) {
+  if (offloadConfig().TracingEnabled) {
+    std::cout << "---> olEnqueueDataCopy";
+  }
+
+  ol_result_t Result =
+      olEnqueueDataCopy_val(Queue, SrcPtr, DstPtr, DstDevice, Size, EventOut);
+
+  if (offloadConfig().TracingEnabled) {
+    ol_enqueue_data_copy_params_t Params = {&Queue,     &SrcPtr, &DstPtr,
+                                            &DstDevice, &Size,   &EventOut};
+    std::cout << "(" << &Params << ")";
+    std::cout << "-> " << Result << "\n";
+    if (Result && Result->Details) {
+      std::cout << "     *Error Details* " << Result->Details << " \n";
+    }
+  }
+  return Result;
+}
+ol_result_t olEnqueueDataCopyWithCodeLoc(ol_queue_handle_t Queue, void *SrcPtr,
+                                         void *DstPtr,
+                                         ol_device_handle_t DstDevice,
+                                         size_t Size,
+                                         ol_event_handle_t *EventOut,
+                                         ol_code_location_t *CodeLocation) {
+  currentCodeLocation() = CodeLocation;
+  ol_result_t Result =
+      olEnqueueDataCopy(Queue, SrcPtr, DstPtr, DstDevice, Size, EventOut);
+
+  currentCodeLocation() = nullptr;
+  return Result;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+ol_impl_result_t olEnqueueKernelLaunch_val(ol_queue_handle_t Queue,
+                                           ol_kernel_handle_t Kernel,
+                                           const size_t *GlobalWorkSize,
+                                           ol_event_handle_t *EventOut) {
+  if (true /*enableParameterValidation*/) {
+    if (NULL == Queue) {
+      return OL_ERRC_INVALID_NULL_HANDLE;
+    }
+
+    if (NULL == Kernel) {
+      return OL_ERRC_INVALID_NULL_HANDLE;
+    }
+
+    if (NULL == GlobalWorkSize) {
+      return OL_ERRC_INVALID_NULL_POINTER;
+    }
+  }
+
+  return olEnqueueKernelLaunch_impl(Queue, Kernel, GlobalWorkSize, EventOut);
+}
+OL_APIEXPORT ol_result_t OL_APICALL olEnqueueKernelLaunch(
+    ol_queue_handle_t Queue, ol_kernel_handle_t Kernel,
+    const size_t *GlobalWorkSize, ol_event_handle_t *EventOut) {
+  if (offloadConfig().TracingEnabled) {
+    std::cout << "---> olEnqueueKernelLaunch";
+  }
+
+  ol_result_t Result =
+      olEnqueueKernelLaunch_val(Queue, Kernel, GlobalWorkSize, EventOut);
+
+  if (offloadConfig().TracingEnabled) {
+    ol_enqueue_kernel_launch_params_t Params = {&Queue, &Kernel,
+                                                &GlobalWorkSize, &EventOut};
+    std::cout << "(" << &Params << ")";
+    std::cout << "-> " << Result << "\n";
+    if (Result && Result->Details) {
+      std::cout << "     *Error Details* " << Result->Details << " \n";
+    }
+  }
+  return Result;
+}
+ol_result_t olEnqueueKernelLaunchWithCodeLoc(ol_queue_handle_t Queue,
+                                             ol_kernel_handle_t Kernel,
+                                             const size_t *GlobalWorkSize,
+                                             ol_event_handle_t *EventOut,
+                                             ol_code_location_t *CodeLocation) {
+  currentCodeLocation() = CodeLocation;
+  ol_result_t Result =
+      olEnqueueKernelLaunch(Queue, Kernel, GlobalWorkSize, EventOut);
+
+  currentCodeLocation() = nullptr;
+  return Result;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+ol_impl_result_t olCreateProgram_val(ol_device_handle_t Device, void *ProgData,
+                                     size_t ProgDataSize,
+                                     ol_program_handle_t *Queue) {
+  if (true /*enableParameterValidation*/) {
+    if (NULL == Device) {
+      return OL_ERRC_INVALID_NULL_HANDLE;
+    }
+
+    if (NULL == ProgData) {
+      return OL_ERRC_INVALID_NULL_POINTER;
+    }
+
+    if (NULL == Queue) {
+      return OL_ERRC_INVALID_NULL_POINTER;
+    }
+  }
+
+  return olCreateProgram_impl(Device, ProgData, ProgDataSize, Queue);
+}
+OL_APIEXPORT ol_result_t OL_APICALL
+olCreateProgram(ol_device_handle_t Device, void *ProgData, size_t ProgDataSize,
+                ol_program_handle_t *Queue) {
+  if (offloadConfig().TracingEnabled) {
+    std::cout << "---> olCreateProgram";
+  }
+
+  ol_result_t Result =
+      olCreateProgram_val(Device, ProgData, ProgDataSize, Queue);
+
+  if (offloadConfig().TracingEnabled) {
+    ol_create_program_params_t Params = {&Device, &ProgData, &ProgDataSize,
+                                         &Queue};
+    std::cout << "(" << &Params << ")";
+    std::cout << "-> " << Result << "\n";
+    if (Result && Result->Details) {
+      std::cout << "     *Error Details* " << Result->Details << " \n";
+    }
+  }
+  return Result;
+}
+ol_result_t olCreateProgramWithCodeLoc(ol_device_handle_t Device,
+                                       void *ProgData, size_t ProgDataSize,
+                                       ol_program_handle_t *Queue,
+                                       ol_code_location_t *CodeLocation) {
+  currentCodeLocation() = CodeLocation;
+  ol_result_t Result = olCreateProgram(Device, ProgData, ProgDataSize, Queue);
+
+  currentCodeLocation() = nullptr;
+  return Result;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+ol_impl_result_t olRetainProgram_val(ol_program_handle_t Program) {
+  if (true /*enableParameterValidation*/) {
+    if (NULL == Program) {
+      return OL_ERRC_INVALID_NULL_HANDLE;
+    }
+  }
+
+  return olRetainProgram_impl(Program);
+}
+OL_APIEXPORT ol_result_t OL_APICALL
+olRetainProgram(ol_program_handle_t Program) {
+  if (offloadConfig().TracingEnabled) {
+    std::cout << "---> olRetainProgram";
+  }
+
+  ol_result_t Result = olRetainProgram_val(Program);
+
+  if (offloadConfig().TracingEnabled) {
+    ol_retain_program_params_t Params = {&Program};
+    std::cout << "(" << &Params << ")";
+    std::cout << "-> " << Result << "\n";
+    if (Result && Result->Details) {
+      std::cout << "     *Error Details* " << Result->Details << " \n";
+    }
+  }
+  return Result;
+}
+ol_result_t olRetainProgramWithCodeLoc(ol_program_handle_t Program,
+                                       ol_code_location_t *CodeLocation) {
+  currentCodeLocation() = CodeLocation;
+  ol_result_t Result = olRetainProgram(Program);
+
+  currentCodeLocation() = nullptr;
+  return Result;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+ol_impl_result_t olReleaseProgram_val(ol_program_handle_t Program) {
+  if (true /*enableParameterValidation*/) {
+    if (NULL == Program) {
+      return OL_ERRC_INVALID_NULL_HANDLE;
+    }
+  }
+
+  return olReleaseProgram_impl(Program);
+}
+OL_APIEXPORT ol_result_t OL_APICALL
+olReleaseProgram(ol_program_handle_t Program) {
+  if (offloadConfig().TracingEnabled) {
+    std::cout << "---> olReleaseProgram";
+  }
+
+  ol_result_t Result = olReleaseProgram_val(Program);
+
+  if (offloadConfig().TracingEnabled) {
+    ol_release_program_params_t Params = {&Program};
+    std::cout << "(" << &Params << ")";
+    std::cout << "-> " << Result << "\n";
+    if (Result && Result->Details) {
+      std::cout << "     *Error Details* " << Result->Details << " \n";
+    }
+  }
+  return Result;
+}
+ol_result_t olReleaseProgramWithCodeLoc(ol_program_handle_t Program,
+                                        ol_code_location_t *CodeLocation) {
+  currentCodeLocation() = CodeLocation;
+  ol_result_t Result = olReleaseProgram(Program);
+
+  currentCodeLocation() = nullptr;
+  return Result;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+ol_impl_result_t olCreateKernel_val(ol_program_handle_t Program,
+                                    const char *KernelName,
+                                    ol_kernel_handle_t *Kernel) {
+  if (true /*enableParameterValidation*/) {
+    if (NULL == Program) {
+      return OL_ERRC_INVALID_NULL_HANDLE;
+    }
+
+    if (NULL == KernelName) {
+      return OL_ERRC_INVALID_NULL_POINTER;
+    }
+
+    if (NULL == Kernel) {
+      return OL_ERRC_INVALID_NULL_POINTER;
+    }
+  }
+
+  return olCreateKernel_impl(Program, KernelName, Kernel);
+}
+OL_APIEXPORT ol_result_t OL_APICALL olCreateKernel(ol_program_handle_t Program,
+                                                   const char *KernelName,
+                                                   ol_kernel_handle_t *Kernel) {
+  if (offloadConfig().TracingEnabled) {
+    std::cout << "---> olCreateKernel";
+  }
+
+  ol_result_t Result = olCreateKernel_val(Program, KernelName, Kernel);
+
+  if (offloadConfig().TracingEnabled) {
+    ol_create_kernel_params_t Params = {&Program, &KernelName, &Kernel};
+    std::cout << "(" << &Params << ")";
+    std::cout << "-> " << Result << "\n";
+    if (Result && Result->Details) {
+      std::cout << "     *Error Details* " << Result->Details << " \n";
+    }
+  }
+  return Result;
+}
+ol_result_t olCreateKernelWithCodeLoc(ol_program_handle_t Program,
+                                      const char *KernelName,
+                                      ol_kernel_handle_t *Kernel,
+                                      ol_code_location_t *CodeLocation) {
+  currentCodeLocation() = CodeLocation;
+  ol_result_t Result = olCreateKernel(Program, KernelName, Kernel);
+
+  currentCodeLocation() = nullptr;
+  return Result;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+ol_impl_result_t olRetainKernel_val(ol_kernel_handle_t Kernel) {
+  if (true /*enableParameterValidation*/) {
+    if (NULL == Kernel) {
+      return OL_ERRC_INVALID_NULL_HANDLE;
+    }
+  }
+
+  return olRetainKernel_impl(Kernel);
+}
+OL_APIEXPORT ol_result_t OL_APICALL olRetainKernel(ol_kernel_handle_t Kernel) {
+  if (offloadConfig().TracingEnabled) {
+    std::cout << "---> olRetainKernel";
+  }
+
+  ol_result_t Result = olRetainKernel_val(Kernel);
+
+  if (offloadConfig().TracingEnabled) {
+    ol_retain_kernel_params_t Params = {&Kernel};
+    std::cout << "(" << &Params << ")";
+    std::cout << "-> " << Result << "\n";
+    if (Result && Result->Details) {
+      std::cout << "     *Error Details* " << Result->Details << " \n";
+    }
+  }
+  return Result;
+}
+ol_result_t olRetainKernelWithCodeLoc(ol_kernel_handle_t Kernel,
+                                      ol_code_location_t *CodeLocation) {
+  currentCodeLocation() = CodeLocation;
+  ol_result_t Result = olRetainKernel(Kernel);
+
+  currentCodeLocation() = nullptr;
+  return Result;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+ol_impl_result_t olReleaseKernel_val(ol_kernel_handle_t Kernel) {
+  if (true /*enableParameterValidation*/) {
+    if (NULL == Kernel) {
+      return OL_ERRC_INVALID_NULL_HANDLE;
+    }
+  }
+
+  return olReleaseKernel_impl(Kernel);
+}
+OL_APIEXPORT ol_result_t OL_APICALL olReleaseKernel(ol_kernel_handle_t Kernel) {
+  if (offloadConfig().TracingEnabled) {
+    std::cout << "---> olReleaseKernel";
+  }
+
+  ol_result_t Result = olReleaseKernel_val(Kernel);
+
+  if (offloadConfig().TracingEnabled) {
+    ol_release_kernel_params_t Params = {&Kernel};
+    std::cout << "(" << &Params << ")";
+    std::cout << "-> " << Result << "\n";
+    if (Result && Result->Details) {
+      std::cout << "     *Error Details* " << Result->Details << " \n";
+    }
+  }
+  return Result;
+}
+ol_result_t olReleaseKernelWithCodeLoc(ol_kernel_handle_t Kernel,
+                                       ol_code_location_t *CodeLocation) {
+  currentCodeLocation() = CodeLocation;
+  ol_result_t Result = olReleaseKernel(Kernel);
+
+  currentCodeLocation() = nullptr;
+  return Result;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+ol_impl_result_t olSetKernelArgValue_val(ol_kernel_handle_t Kernel,
+                                         uint32_t Index, size_t Size,
+                                         void *ArgData) {
+  if (true /*enableParameterValidation*/) {
+    if (NULL == Kernel) {
+      return OL_ERRC_INVALID_NULL_HANDLE;
+    }
+
+    if (NULL == ArgData) {
+      return OL_ERRC_INVALID_NULL_POINTER;
+    }
+  }
+
+  return olSetKernelArgValue_impl(Kernel, Index, Size, ArgData);
+}
+OL_APIEXPORT ol_result_t OL_APICALL olSetKernelArgValue(
+    ol_kernel_handle_t Kernel, uint32_t Index, size_t Size, void *ArgData) {
+  if (offloadConfig().TracingEnabled) {
+    std::cout << "---> olSetKernelArgValue";
+  }
+
+  ol_result_t Result = olSetKernelArgValue_val(Kernel, Index, Size, ArgData);
+
+  if (offloadConfig().TracingEnabled) {
+    ol_set_kernel_arg_value_params_t Params = {&Kernel, &Index, &Size,
+                                               &ArgData};
+    std::cout << "(" << &Params << ")";
+    std::cout << "-> " << Result << "\n";
+    if (Result && Result->Details) {
+      std::cout << "     *Error Details* " << Result->Details << " \n";
+    }
+  }
+  return Result;
+}
+ol_result_t olSetKernelArgValueWithCodeLoc(ol_kernel_handle_t Kernel,
+                                           uint32_t Index, size_t Size,
+                                           void *ArgData,
+                                           ol_code_location_t *CodeLocation) {
+  currentCodeLocation() = CodeLocation;
+  ol_result_t Result = olSetKernelArgValue(Kernel, Index, Size, ArgData);
+
+  currentCodeLocation() = nullptr;
+  return Result;
+}
diff --git a/offload/liboffload/include/generated/OffloadFuncs.inc b/offload/liboffload/include/generated/OffloadFuncs.inc
index 26120f18279dc..6f2bb34599a1d 100644
--- a/offload/liboffload/include/generated/OffloadFuncs.inc
+++ b/offload/liboffload/include/generated/OffloadFuncs.inc
@@ -22,6 +22,24 @@ OFFLOAD_FUNC(olGetDeviceInfo)
 OFFLOAD_FUNC(olGetDeviceInfoSize)
 OFFLOAD_FUNC(olMemAlloc)
 OFFLOAD_FUNC(olMemFree)
+OFFLOAD_FUNC(olCreateQueue)
+OFFLOAD_FUNC(olRetainQueue)
+OFFLOAD_FUNC(olReleaseQueue)
+OFFLOAD_FUNC(olFinishQueue)
+OFFLOAD_FUNC(olRetainEvent)
+OFFLOAD_FUNC(olReleaseEvent)
+OFFLOAD_FUNC(olWaitEvent)
+OFFLOAD_FUNC(olEnqueueDataWrite)
+OFFLOAD_FUNC(olEnqueueDataRead)
+OFFLOAD_FUNC(olEnqueueDataCopy)
+OFFLOAD_FUNC(olEnqueueKernelLaunch)
+OFFLOAD_FUNC(olCreateProgram)
+OFFLOAD_FUNC(olRetainProgram)
+OFFLOAD_FUNC(olReleaseProgram)
+OFFLOAD_FUNC(olCreateKernel)
+OFFLOAD_FUNC(olRetainKernel)
+OFFLOAD_FUNC(olReleaseKernel)
+OFFLOAD_FUNC(olSetKernelArgValue)
 OFFLOAD_FUNC(olInitWithCodeLoc)
 OFFLOAD_FUNC(olShutDownWithCodeLoc)
 OFFLOAD_FUNC(olGetPlatformWithCodeLoc)
@@ -34,5 +52,23 @@ OFFLOAD_FUNC(olGetDeviceInfoWithCodeLoc)
 OFFLOAD_FUNC(olGetDeviceInfoSizeWithCodeLoc)
 OFFLOAD_FUNC(olMemAllocWithCodeLoc)
 OFFLOAD_FUNC(olMemFreeWithCodeLoc)
+OFFLOAD_FUNC(olCreateQueueWithCodeLoc)
+OFFLOAD_FUNC(olRetainQueueWithCodeLoc)
+OFFLOAD_FUNC(olReleaseQueueWithCodeLoc)
+OFFLOAD_FUNC(olFinishQueueWithCodeLoc)
+OFFLOAD_FUNC(olRetainEventWithCodeLoc)
+OFFLOAD_FUNC(olReleaseEventWithCodeLoc)
+OFFLOAD_FUNC(olWaitEventWithCodeLoc)
+OFFLOAD_FUNC(olEnqueueDataWriteWithCodeLoc)
+OFFLOAD_FUNC(olEnqueueDataReadWithCodeLoc)
+OFFLOAD_FUNC(olEnqueueDataCopyWithCodeLoc)
+OFFLOAD_FUNC(olEnqueueKernelLaunchWithCodeLoc)
+OFFLOAD_FUNC(olCreateProgramWithCodeLoc)
+OFFLOAD_FUNC(olRetainProgramWithCodeLoc)
+OFFLOAD_FUNC(olReleaseProgramWithCodeLoc)
+OFFLOAD_FUNC(olCreateKernelWithCodeLoc)
+OFFLOAD_FUNC(olRetainKernelWithCodeLoc)
+OFFLOAD_FUNC(olReleaseKernelWithCodeLoc)
+OFFLOAD_FUNC(olSetKernelArgValueWithCodeLoc)
 
 #undef OFFLOAD_FUNC
diff --git a/offload/liboffload/include/generated/OffloadImplFuncDecls.inc b/offload/liboffload/include/generated/OffloadImplFuncDecls.inc
index f0a96081fd243..9d21d8fc97090 100644
--- a/offload/liboffload/include/generated/OffloadImplFuncDecls.inc
+++ b/offload/liboffload/include/generated/OffloadImplFuncDecls.inc
@@ -43,3 +43,57 @@ ol_impl_result_t olMemAlloc_impl(ol_device_handle_t Device,
 
 ol_impl_result_t olMemFree_impl(ol_device_handle_t Device, ol_alloc_type_t Type,
                                 void *Address);
+
+ol_impl_result_t olCreateQueue_impl(ol_device_handle_t Device,
+                                    ol_queue_handle_t *Queue);
+
+ol_impl_result_t olRetainQueue_impl(ol_queue_handle_t Queue);
+
+ol_impl_result_t olReleaseQueue_impl(ol_queue_handle_t Queue);
+
+ol_impl_result_t olFinishQueue_impl(ol_queue_handle_t Queue);
+
+ol_impl_result_t olRetainEvent_impl(ol_event_handle_t Event);
+
+ol_impl_result_t olReleaseEvent_impl(ol_event_handle_t Event);
+
+ol_impl_result_t olWaitEvent_impl(ol_event_handle_t Event);
+
+ol_impl_result_t olEnqueueDataWrite_impl(ol_queue_handle_t Queue, void *SrcPtr,
+                                         void *DstPtr, size_t Size,
+                                         ol_event_handle_t *EventOut);
+
+ol_impl_result_t olEnqueueDataRead_impl(ol_queue_handle_t Queue, void *SrcPtr,
+                                        void *DstPtr, size_t Size,
+                                        ol_event_handle_t *EventOut);
+
+ol_impl_result_t olEnqueueDataCopy_impl(ol_queue_handle_t Queue, void *SrcPtr,
+                                        void *DstPtr,
+                                        ol_device_handle_t DstDevice,
+                                        size_t Size,
+                                        ol_event_handle_t *EventOut);
+
+ol_impl_result_t olEnqueueKernelLaunch_impl(ol_queue_handle_t Queue,
+                                            ol_kernel_handle_t Kernel,
+                                            const size_t *GlobalWorkSize,
+                                            ol_event_handle_t *EventOut);
+
+ol_impl_result_t olCreateProgram_impl(ol_device_handle_t Device, void *ProgData,
+                                      size_t ProgDataSize,
+                                      ol_program_handle_t *Queue);
+
+ol_impl_result_t olRetainProgram_impl(ol_program_handle_t Program);
+
+ol_impl_result_t olReleaseProgram_impl(ol_program_handle_t Program);
+
+ol_impl_result_t olCreateKernel_impl(ol_program_handle_t Program,
+                                     const char *KernelName,
+                                     ol_kernel_handle_t *Kernel);
+
+ol_impl_result_t olRetainKernel_impl(ol_kernel_handle_t Kernel);
+
+ol_impl_result_t olReleaseKernel_impl(ol_kernel_handle_t Kernel);
+
+ol_impl_result_t olSetKernelArgValue_impl(ol_kernel_handle_t Kernel,
+                                          uint32_t Index, size_t Size,
+                                          void *ArgData);
diff --git a/offload/liboffload/include/generated/OffloadPrint.hpp b/offload/liboffload/include/generated/OffloadPrint.hpp
index cff754237568e..698b422fc38d0 100644
--- a/offload/liboffload/include/generated/OffloadPrint.hpp
+++ b/offload/liboffload/include/generated/OffloadPrint.hpp
@@ -22,6 +22,10 @@ template <typename T> struct is_handle : std::false_type {};
 template <> struct is_handle<ol_platform_handle_t> : std::true_type {};
 template <> struct is_handle<ol_device_handle_t> : std::true_type {};
 template <> struct is_handle<ol_context_handle_t> : std::true_type {};
+template <> struct is_handle<ol_queue_handle_t> : std::true_type {};
+template <> struct is_handle<ol_event_handle_t> : std::true_type {};
+template <> struct is_handle<ol_program_handle_t> : std::true_type {};
+template <> struct is_handle<ol_kernel_handle_t> : std::true_type {};
 template <typename T> inline constexpr bool is_handle_v = is_handle<T>::value;
 
 inline std::ostream &operator<<(std::ostream &os, enum ol_errc_t value);
@@ -455,6 +459,212 @@ inline std::ostream &operator<<(std::ostream &os,
   return os;
 }
 
+inline std::ostream &operator<<(std::ostream &os,
+                                const struct ol_create_queue_params_t *params) {
+  os << ".Device = ";
+  printPtr(os, *params->pDevice);
+  os << ", ";
+  os << ".Queue = ";
+  printPtr(os, *params->pQueue);
+  return os;
+}
+
+inline std::ostream &operator<<(std::ostream &os,
+                                const struct ol_retain_queue_params_t *params) {
+  os << ".Queue = ";
+  printPtr(os, *params->pQueue);
+  return os;
+}
+
+inline std::ostream &
+operator<<(std::ostream &os, const struct ol_release_queue_params_t *params) {
+  os << ".Queue = ";
+  printPtr(os, *params->pQueue);
+  return os;
+}
+
+inline std::ostream &operator<<(std::ostream &os,
+                                const struct ol_finish_queue_params_t *params) {
+  os << ".Queue = ";
+  printPtr(os, *params->pQueue);
+  return os;
+}
+
+inline std::ostream &operator<<(std::ostream &os,
+                                const struct ol_retain_event_params_t *params) {
+  os << ".Event = ";
+  printPtr(os, *params->pEvent);
+  return os;
+}
+
+inline std::ostream &
+operator<<(std::ostream &os, const struct ol_release_event_params_t *params) {
+  os << ".Event = ";
+  printPtr(os, *params->pEvent);
+  return os;
+}
+
+inline std::ostream &operator<<(std::ostream &os,
+                                const struct ol_wait_event_params_t *params) {
+  os << ".Event = ";
+  printPtr(os, *params->pEvent);
+  return os;
+}
+
+inline std::ostream &
+operator<<(std::ostream &os,
+           const struct ol_enqueue_data_write_params_t *params) {
+  os << ".Queue = ";
+  printPtr(os, *params->pQueue);
+  os << ", ";
+  os << ".SrcPtr = ";
+  printPtr(os, *params->pSrcPtr);
+  os << ", ";
+  os << ".DstPtr = ";
+  printPtr(os, *params->pDstPtr);
+  os << ", ";
+  os << ".Size = ";
+  os << *params->pSize;
+  os << ", ";
+  os << ".EventOut = ";
+  printPtr(os, *params->pEventOut);
+  return os;
+}
+
+inline std::ostream &
+operator<<(std::ostream &os,
+           const struct ol_enqueue_data_read_params_t *params) {
+  os << ".Queue = ";
+  printPtr(os, *params->pQueue);
+  os << ", ";
+  os << ".SrcPtr = ";
+  printPtr(os, *params->pSrcPtr);
+  os << ", ";
+  os << ".DstPtr = ";
+  printPtr(os, *params->pDstPtr);
+  os << ", ";
+  os << ".Size = ";
+  os << *params->pSize;
+  os << ", ";
+  os << ".EventOut = ";
+  printPtr(os, *params->pEventOut);
+  return os;
+}
+
+inline std::ostream &
+operator<<(std::ostream &os,
+           const struct ol_enqueue_data_copy_params_t *params) {
+  os << ".Queue = ";
+  printPtr(os, *params->pQueue);
+  os << ", ";
+  os << ".SrcPtr = ";
+  printPtr(os, *params->pSrcPtr);
+  os << ", ";
+  os << ".DstPtr = ";
+  printPtr(os, *params->pDstPtr);
+  os << ", ";
+  os << ".DstDevice = ";
+  printPtr(os, *params->pDstDevice);
+  os << ", ";
+  os << ".Size = ";
+  os << *params->pSize;
+  os << ", ";
+  os << ".EventOut = ";
+  printPtr(os, *params->pEventOut);
+  return os;
+}
+
+inline std::ostream &
+operator<<(std::ostream &os,
+           const struct ol_enqueue_kernel_launch_params_t *params) {
+  os << ".Queue = ";
+  printPtr(os, *params->pQueue);
+  os << ", ";
+  os << ".Kernel = ";
+  printPtr(os, *params->pKernel);
+  os << ", ";
+  os << ".GlobalWorkSize = ";
+  printPtr(os, *params->pGlobalWorkSize);
+  os << ", ";
+  os << ".EventOut = ";
+  printPtr(os, *params->pEventOut);
+  return os;
+}
+
+inline std::ostream &
+operator<<(std::ostream &os, const struct ol_create_program_params_t *params) {
+  os << ".Device = ";
+  printPtr(os, *params->pDevice);
+  os << ", ";
+  os << ".ProgData = ";
+  printPtr(os, *params->pProgData);
+  os << ", ";
+  os << ".ProgDataSize = ";
+  os << *params->pProgDataSize;
+  os << ", ";
+  os << ".Queue = ";
+  printPtr(os, *params->pQueue);
+  return os;
+}
+
+inline std::ostream &
+operator<<(std::ostream &os, const struct ol_retain_program_params_t *params) {
+  os << ".Program = ";
+  printPtr(os, *params->pProgram);
+  return os;
+}
+
+inline std::ostream &
+operator<<(std::ostream &os, const struct ol_release_program_params_t *params) {
+  os << ".Program = ";
+  printPtr(os, *params->pProgram);
+  return os;
+}
+
+inline std::ostream &
+operator<<(std::ostream &os, const struct ol_create_kernel_params_t *params) {
+  os << ".Program = ";
+  printPtr(os, *params->pProgram);
+  os << ", ";
+  os << ".KernelName = ";
+  printPtr(os, *params->pKernelName);
+  os << ", ";
+  os << ".Kernel = ";
+  printPtr(os, *params->pKernel);
+  return os;
+}
+
+inline std::ostream &
+operator<<(std::ostream &os, const struct ol_retain_kernel_params_t *params) {
+  os << ".Kernel = ";
+  printPtr(os, *params->pKernel);
+  return os;
+}
+
+inline std::ostream &
+operator<<(std::ostream &os, const struct ol_release_kernel_params_t *params) {
+  os << ".Kernel = ";
+  printPtr(os, *params->pKernel);
+  return os;
+}
+
+inline std::ostream &
+operator<<(std::ostream &os,
+           const struct ol_set_kernel_arg_value_params_t *params) {
+  os << ".Kernel = ";
+  printPtr(os, *params->pKernel);
+  os << ", ";
+  os << ".Index = ";
+  os << *params->pIndex;
+  os << ", ";
+  os << ".Size = ";
+  os << *params->pSize;
+  os << ", ";
+  os << ".ArgData = ";
+  printPtr(os, *params->pArgData);
+  return os;
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 // @brief Print pointer value
 template <typename T>
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index 3e609ed03917f..d1c72ecced875 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -34,6 +34,117 @@ struct ol_platform_handle_t_ {
   std::vector<ol_device_handle_t_> Devices;
 };
 
+struct ol_queue_handle_t_ {
+  __tgt_async_info *AsyncInfo;
+  ol_device_handle_t Device;
+  std::atomic_uint32_t RefCount;
+};
+
+struct ol_event_handle_t_ {
+  void *EventInfo;
+  ol_queue_handle_t Queue;
+  ol_device_handle_t Device;
+  std::atomic_uint32_t RefCount;
+};
+
+struct ol_program_handle_t_ {
+  llvm::omp::target::plugin::DeviceImageTy *Image;
+  std::atomic_uint32_t RefCount;
+};
+
+struct OffloadArguments {
+  static constexpr size_t MaxParamBytes = 4000u;
+  using args_t = std::array<char, MaxParamBytes>;
+  using args_size_t = std::vector<size_t>;
+  using args_index_t = std::vector<void *>;
+  args_t Storage;             // Argument values packed back to back
+  args_size_t ParamSizes;     // Byte size of each argument
+  args_index_t Indices;       // Pointer to each argument, implicit offset last
+  args_size_t OffsetPerIndex; // Local-memory bytes attributed to each argument
+
+  std::uint32_t ImplicitOffsetArgs[3] = {0, 0, 0};
+
+  // NOTE:
+  // This implementation is an exact copy of the CUDA adapter's argument
+  // implementation. Even though it was designed for CUDA, the design of
+  // libomptarget means it should work for other plugins as they will expect
+  // the same argument layout.
+  OffloadArguments() {
+    // Place the implicit offset index at the end of the indices collection
+    Indices.emplace_back(&ImplicitOffsetArgs);
+  }
+
+  /// Add an argument to the kernel.
+  /// If the argument existed before, it is replaced.
+  /// Otherwise, it is added.
+  /// Gaps are filled with empty arguments.
+  /// Implicit offset argument is kept at the back of the indices collection.
+  void addArg(size_t Index, size_t Size, const void *Arg,
+              size_t LocalSize = 0) {
+    if (Index + 2 > Indices.size()) {
+      // Move implicit offset argument index with the end
+      Indices.resize(Index + 2, Indices.back());
+      // Ensure enough space for the new argument
+      ParamSizes.resize(Index + 1);
+      OffsetPerIndex.resize(Index + 1);
+    }
+    ParamSizes[Index] = Size;
+    // Byte offset of this argument: sum of the sizes of all preceding ones.
+    size_t InsertPos = std::accumulate(
+        std::begin(ParamSizes), std::begin(ParamSizes) + Index, size_t{0});
+    assert(InsertPos + Size <= MaxParamBytes && "kernel arguments overflow");
+    std::memcpy(&Storage[InsertPos], Arg, Size);
+    Indices[Index] = &Storage[InsertPos];
+    OffsetPerIndex[Index] = LocalSize;
+  }
+
+  void addLocalArg(size_t Index, size_t Size) {
+    size_t LocalOffset = this->getLocalSize();
+
+    // maximum required alignment is the size of the largest vector type
+    const size_t MaxAlignment = sizeof(double) * 16;
+
+    // for arguments smaller than the maximum alignment simply align to the
+    // size of the argument (at least 1 so a zero Size cannot divide by zero)
+    const size_t Alignment = std::max<size_t>(std::min(MaxAlignment, Size), 1);
+
+    // align the argument
+    size_t AlignedLocalOffset = LocalOffset;
+    size_t Pad = LocalOffset % Alignment;
+    if (Pad != 0) {
+      AlignedLocalOffset += Alignment - Pad;
+    }
+
+    addArg(Index, sizeof(size_t), (const void *)&(AlignedLocalOffset),
+           Size + (AlignedLocalOffset - LocalOffset));
+  }
+
+  void setImplicitOffset(size_t Size, std::uint32_t *ImplicitOffset) {
+    assert(Size == sizeof(std::uint32_t) * 3);
+    std::memcpy(ImplicitOffsetArgs, ImplicitOffset, Size);
+  }
+
+  void clearLocalSize() {
+    std::fill(std::begin(OffsetPerIndex), std::end(OffsetPerIndex), 0);
+  }
+
+  const args_index_t &getIndices() const noexcept { return Indices; }
+
+  uint32_t getLocalSize() const {
+    return static_cast<uint32_t>(std::accumulate(
+        std::begin(OffsetPerIndex), std::end(OffsetPerIndex), size_t{0}));
+  }
+
+  const char *getStorage() const noexcept { return Storage.data(); }
+};
+
+struct ol_kernel_handle_t_ {
+  ol_program_handle_t Program;
+  std::atomic_uint32_t RefCount;
+  GenericKernelTy *KernelImpl;
+  OffloadArguments Args;
+};
+
 using PlatformVecT = SmallVector<ol_platform_handle_t_, 4>;
 PlatformVecT &Platforms() {
   static PlatformVecT Platforms;
@@ -280,3 +391,252 @@ ol_impl_result_t olMemFree_impl(ol_device_handle_t Device, ol_alloc_type_t Type,
   }
   return OL_SUCCESS;
 }
+
+ol_impl_result_t olCreateQueue_impl(ol_device_handle_t Device,
+                                    ol_queue_handle_t *Queue) {
+  auto CreatedQueue = std::make_unique<ol_queue_handle_t_>();
+  auto Err = Device->Device.initAsyncInfo(&(CreatedQueue->AsyncInfo));
+  if (Err) {
+    return OL_ERRC_OUT_OF_RESOURCES;
+  }
+  // TODO: Check error
+  CreatedQueue->Device = Device;
+  CreatedQueue->RefCount = 1;
+  *Queue = CreatedQueue.release();
+  return OL_SUCCESS;
+}
+
+ol_impl_result_t olRetainQueue_impl(ol_queue_handle_t Queue) {
+  Queue->RefCount++;
+  return OL_SUCCESS;
+}
+
+ol_impl_result_t olReleaseQueue_impl(ol_queue_handle_t Queue) {
+  Queue->RefCount--;
+  return OL_SUCCESS;
+}
+
+ol_impl_result_t olFinishQueue_impl(ol_queue_handle_t Queue) {
+  // Host plugin doesn't have a queue set so it's not safe to call synchronize
+  // on it, but we have nothing to synchronize in that situation anyway.
+  if (Queue->AsyncInfo->Queue) {
+    auto Err = Queue->Device->Device.synchronize(Queue->AsyncInfo);
+    if (Err) {
+      return OL_ERRC_OUT_OF_RESOURCES;
+    }
+  }
+
+  // Recreate the stream resource so the queue can be reused
+  // TODO: Would be easier for the synchronization to (optionally) not release
+  // it to begin with.
+  auto Res = Queue->Device->Device.initAsyncInfo(&Queue->AsyncInfo);
+  if (Res) {
+    return OL_ERRC_OUT_OF_RESOURCES;
+  }
+
+  return OL_SUCCESS;
+}
+
+ol_impl_result_t olWaitEvent_impl(ol_event_handle_t Event) {
+  auto Res = Event->Device->Device.syncEvent(Event->EventInfo);
+  if (Res) {
+    return OL_ERRC_OUT_OF_RESOURCES;
+  }
+  return OL_SUCCESS;
+}
+
+ol_impl_result_t olRetainEvent_impl(ol_event_handle_t Event) {
+  Event->RefCount++;
+  return OL_SUCCESS;
+}
+
+ol_impl_result_t olReleaseEvent_impl(ol_event_handle_t Event) {
+  Event->RefCount--;
+  return OL_SUCCESS;
+}
+
+ol_event_handle_t makeEvent(ol_queue_handle_t Queue) {
+  // Returns a recorded event for Queue, or nullptr if the plugin fails.
+  auto EventImpl = std::make_unique<ol_event_handle_t_>();
+  EventImpl->Queue = Queue;
+  EventImpl->Device = Queue->Device; // olWaitEvent_impl syncs through this
+  EventImpl->RefCount = 1;
+  auto Res = Queue->Device->Device.createEvent(&EventImpl->EventInfo);
+  if (Res)
+    return nullptr;
+  Res =
+      Queue->Device->Device.recordEvent(EventImpl->EventInfo, Queue->AsyncInfo);
+  if (Res)
+    return nullptr;
+  return EventImpl.release();
+}
+
+ol_impl_result_t olEnqueueDataWrite_impl(ol_queue_handle_t Queue, void *SrcPtr,
+                                         void *DstPtr, size_t Size,
+                                         ol_event_handle_t *EventOut) {
+  auto &DeviceImpl = Queue->Device->Device;
+
+  auto Res = DeviceImpl.dataSubmit(DstPtr, SrcPtr, Size, Queue->AsyncInfo);
+
+  if (Res) {
+    return OL_ERRC_OUT_OF_RESOURCES;
+  }
+
+  if (EventOut) {
+    *EventOut = makeEvent(Queue);
+  }
+
+  return OL_SUCCESS;
+}
+
+ol_impl_result_t olEnqueueDataRead_impl(ol_queue_handle_t Queue, void *SrcPtr,
+                                        void *DstPtr, size_t Size,
+                                        ol_event_handle_t *EventOut) {
+  auto &DeviceImpl = Queue->Device->Device;
+
+  auto Res = DeviceImpl.dataRetrieve(DstPtr, SrcPtr, Size, Queue->AsyncInfo);
+
+  if (Res) {
+    return OL_ERRC_OUT_OF_RESOURCES;
+  }
+
+  if (EventOut) {
+    *EventOut = makeEvent(Queue);
+  }
+
+  return OL_SUCCESS;
+}
+
+ol_impl_result_t olEnqueueDataCopy_impl(ol_queue_handle_t Queue, void *SrcPtr,
+                                        void *DstPtr,
+                                        ol_device_handle_t DstDevice,
+                                        size_t Size,
+                                        ol_event_handle_t *EventOut) {
+  auto &DeviceImpl = Queue->Device->Device;
+
+  auto Res = DeviceImpl.dataExchange(SrcPtr, DstDevice->Device, DstPtr, Size,
+                                     Queue->AsyncInfo);
+
+  if (Res) {
+    return OL_ERRC_OUT_OF_RESOURCES;
+  }
+
+  if (EventOut) {
+    *EventOut = makeEvent(Queue);
+  }
+
+  return OL_SUCCESS;
+}
+
+ol_impl_result_t olCreateProgram_impl(ol_device_handle_t Device, void *ProgData,
+                                      size_t ProgDataSize,
+                                      ol_program_handle_t *Program) {
+
+  __tgt_device_image DeviceImage{
+      ProgData, ((char *)ProgData) + ProgDataSize - 1, nullptr, nullptr};
+
+  auto Res = Device->Device.loadBinary(Device->Device.Plugin, &DeviceImage);
+  if (!Res)
+    return OL_ERRC_INVALID_VALUE;
+
+  ol_program_handle_t Prog = new ol_program_handle_t_();
+  Prog->Image = *Res;
+  Prog->RefCount = 1;
+  *Program = Prog;
+
+  return OL_SUCCESS;
+}
+
+ol_impl_result_t olRetainProgram_impl(ol_program_handle_t Program) {
+  ++Program->RefCount;
+  return OL_SUCCESS;
+}
+
+ol_impl_result_t olReleaseProgram_impl(ol_program_handle_t Program) {
+  if (--Program->RefCount == 0) {
+    delete Program;
+  }
+  return OL_SUCCESS;
+}
+
+ol_impl_result_t olCreateKernel_impl(ol_program_handle_t Program,
+                                     const char *KernelName,
+                                     ol_kernel_handle_t *Kernel) {
+
+  auto &Device = Program->Image->getDevice();
+  auto KernelImpl = Device.constructKernel(KernelName);
+  if (!KernelImpl) {
+    return OL_ERRC_OUT_OF_RESOURCES;
+  }
+
+  auto Err = KernelImpl->init(Device, *Program->Image);
+  if (Err) {
+    return OL_ERRC_OUT_OF_RESOURCES;
+  }
+
+  ol_kernel_handle_t CreatedKernel = new ol_kernel_handle_t_();
+  CreatedKernel->Program = Program;
+  CreatedKernel->RefCount = 1;
+  CreatedKernel->KernelImpl = &*KernelImpl;
+  *Kernel = CreatedKernel;
+
+  return OL_SUCCESS;
+}
+
+ol_impl_result_t olRetainKernel_impl(ol_kernel_handle_t Kernel) {
+  Kernel->RefCount++;
+  return OL_SUCCESS;
+}
+
+ol_impl_result_t olReleaseKernel_impl(ol_kernel_handle_t Kernel) {
+  if (--Kernel->RefCount == 0) {
+    delete Kernel;
+  }
+  return OL_SUCCESS;
+}
+
+ol_impl_result_t olSetKernelArgValue_impl(ol_kernel_handle_t Kernel,
+                                          uint32_t Index, size_t Size,
+                                          void *ArgData) {
+  Kernel->Args.addArg(Index, Size, ArgData);
+
+  return OL_SUCCESS;
+}
+
+ol_impl_result_t olEnqueueKernelLaunch_impl(ol_queue_handle_t Queue,
+                                            ol_kernel_handle_t Kernel,
+                                            const size_t *GlobalWorkSize,
+                                            ol_event_handle_t *EventOut) {
+  auto &DeviceImpl = Queue->Device->Device;
+
+  AsyncInfoWrapperTy AsyncInfoWrapper(DeviceImpl, Queue->AsyncInfo);
+
+  KernelArgsTy LaunchArgs{};
+  LaunchArgs.NumArgs = Kernel->Args.getIndices().size() - 1; // TODO
+  LaunchArgs.NumTeams[0] = GlobalWorkSize[0];
+  LaunchArgs.NumTeams[1] = 1;
+  LaunchArgs.NumTeams[2] = 1;
+  LaunchArgs.ThreadLimit[0] = 1;
+  LaunchArgs.ThreadLimit[1] = 1;
+  LaunchArgs.ThreadLimit[2] = 1;
+
+  LaunchArgs.ArgPtrs = (void **)Kernel->Args.getStorage();
+
+  // TODO: Verify this
+  auto ArgOffsets = std::vector<ptrdiff_t>(LaunchArgs.NumArgs, 0ul);
+
+  auto Err = Kernel->KernelImpl->launch(
+      DeviceImpl, (void **)Kernel->Args.getStorage(), ArgOffsets.data(),
+      LaunchArgs, AsyncInfoWrapper);
+
+  AsyncInfoWrapper.finalize(Err);
+  if (Err) {
+    return OL_ERRC_OUT_OF_RESOURCES;
+  }
+
+  if (EventOut) {
+    *EventOut = makeEvent(Queue);
+  }
+
+  return OL_SUCCESS;
+}
diff --git a/offload/plugins-nextgen/common/include/GlobalHandler.h b/offload/plugins-nextgen/common/include/GlobalHandler.h
index d2914e7cd0eb4..d65fceb8508d2 100644
--- a/offload/plugins-nextgen/common/include/GlobalHandler.h
+++ b/offload/plugins-nextgen/common/include/GlobalHandler.h
@@ -131,8 +131,9 @@ class GenericGlobalHandlerTy {
 
   /// Get the address and size of a global in the image. Address and size are
   /// return in \p ImageGlobal, the global name is passed in \p ImageGlobal.
-  Error getGlobalMetadataFromImage(GenericDeviceTy &Device,
-                                   DeviceImageTy &Image, GlobalTy &ImageGlobal);
+  virtual Error getGlobalMetadataFromImage(GenericDeviceTy &Device,
+                                           DeviceImageTy &Image,
+                                           GlobalTy &ImageGlobal);
 
   /// Read the memory associated with a global from the image and store it on
   /// the host. The name, size, and destination are defined by \p HostGlobal.
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index 894d1c2214b97..d7a69091ada74 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -1327,6 +1327,34 @@ class CUDAGlobalHandlerTy final : public GenericGlobalHandlerTy {
     DeviceGlobal.setPtr(reinterpret_cast<void *>(CUPtr));
     return Plugin::success();
   }
+
+  Error getGlobalMetadataFromImage(GenericDeviceTy &Device,
+                                   DeviceImageTy &Image,
+                                   GlobalTy &ImageGlobal) override {
+    // If the image is an ELF we can use the generic path, otherwise fall back
+    // and use cuModuleGetGlobal to query the image.
+    if (utils::elf::isELF(Image.getMemoryBuffer().getBuffer())) {
+      return GenericGlobalHandlerTy::getGlobalMetadataFromImage(Device, Image,
+                                                                ImageGlobal);
+    }
+
+    CUDADeviceImageTy &CUDAImage = static_cast<CUDADeviceImageTy &>(Image);
+
+    const char *GlobalName = ImageGlobal.getName().data();
+
+    size_t CUSize;
+    CUdeviceptr CUPtr;
+    CUresult Res =
+        cuModuleGetGlobal(&CUPtr, &CUSize, CUDAImage.getModule(), GlobalName);
+    if (auto Err = Plugin::check(Res, "Error in cuModuleGetGlobal for '%s': %s",
+                                 GlobalName))
+      return Err;
+
+    // Setup the global symbol's address and size.
+    ImageGlobal.setPtr(reinterpret_cast<void *>(CUPtr));
+    ImageGlobal.setSize(CUSize);
+    return Plugin::success();
+  }
 };
 
 /// Class implementing the CUDA-specific functionalities of the plugin.
diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp
index 915c41e88c582..1ba9a49f4f9af 100644
--- a/offload/plugins-nextgen/host/src/rtl.cpp
+++ b/offload/plugins-nextgen/host/src/rtl.cpp
@@ -287,9 +287,9 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
     return Plugin::success();
   }
 
-  /// This plugin does not support interoperability
+  /// This plugin does not support interoperability, do nothing
   Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
-    return Plugin::error("initAsyncInfoImpl not supported");
+    return Plugin::success();
   }
 
   /// This plugin does not support interoperability
diff --git a/offload/unittests/OffloadAPI/CMakeLists.txt b/offload/unittests/OffloadAPI/CMakeLists.txt
index 033ee2b6ec746..e0d790684898d 100644
--- a/offload/unittests/OffloadAPI/CMakeLists.txt
+++ b/offload/unittests/OffloadAPI/CMakeLists.txt
@@ -10,7 +10,9 @@ add_libompt_unittest("offload.unittests"
     ${CMAKE_CURRENT_SOURCE_DIR}/device/olGetDevice.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/device/olGetDeviceCount.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/device/olGetDeviceInfo.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/device/olGetDeviceInfoSize.cpp)
+    ${CMAKE_CURRENT_SOURCE_DIR}/device/olGetDeviceInfoSize.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/queue/olCreateQueue.cpp
+    )
 add_dependencies("offload.unittests" ${PLUGINS_TEST_COMMON})
 target_link_libraries("offload.unittests" PRIVATE ${PLUGINS_TEST_COMMON})
 target_include_directories("offload.unittests" PRIVATE ${PLUGINS_TEST_INCLUDE})
diff --git a/offload/unittests/OffloadAPI/queue/olCreateQueue.cpp b/offload/unittests/OffloadAPI/queue/olCreateQueue.cpp
new file mode 100644
index 0000000000000..f542dac4bb2d8
--- /dev/null
+++ b/offload/unittests/OffloadAPI/queue/olCreateQueue.cpp
@@ -0,0 +1,19 @@
+//===------- Offload API tests - olCreateQueue ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "../common/Fixtures.hpp"
+#include <OffloadAPI.h>
+#include <gtest/gtest.h>
+
+using olCreateQueueTest = offloadDeviceTest;
+
+TEST_F(olCreateQueueTest, Success) {
+  ol_queue_handle_t Queue = nullptr;
+  ASSERT_SUCCESS(olCreateQueue(Device, &Queue));
+  ASSERT_NE(Queue, nullptr);
+}

>From f6430fef9a169a19752b7af11e2adccc701d8325 Mon Sep 17 00:00:00 2001
From: Callum Fare <callum at codeplay.com>
Date: Thu, 30 Jan 2025 15:46:15 +0000
Subject: [PATCH 04/17] Make a copy of the program binary in olCreateProgram

---
 offload/liboffload/src/OffloadImpl.cpp | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index d1c72ecced875..1fbb424021eef 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -49,6 +49,7 @@ struct ol_event_handle_t_ {
 
 struct ol_program_handle_t_ {
   llvm::omp::target::plugin::DeviceImageTy *Image;
+  std::unique_ptr<MemoryBuffer> ImageData;
   std::atomic_uint32_t RefCount;
 };
 
@@ -531,17 +532,22 @@ ol_impl_result_t olEnqueueDataCopy_impl(ol_queue_handle_t Queue, void *SrcPtr,
 ol_impl_result_t olCreateProgram_impl(ol_device_handle_t Device, void *ProgData,
                                       size_t ProgDataSize,
                                       ol_program_handle_t *Program) {
+  auto ImageData = MemoryBuffer::getMemBufferCopy(
+      StringRef(reinterpret_cast<char *>(ProgData), ProgDataSize));
+  __tgt_device_image DeviceImage{(char *) ImageData->getBuffer().data(),
+                                 ((char *)ImageData->getBuffer().data()) +
+                                     ProgDataSize - 1,
+                                 nullptr, nullptr};
 
-  __tgt_device_image DeviceImage{
-      ProgData, ((char *)ProgData) + ProgDataSize - 1, nullptr, nullptr};
+  ol_program_handle_t Prog = new ol_program_handle_t_();
 
   auto Res = Device->Device.loadBinary(Device->Device.Plugin, &DeviceImage);
   if (!Res)
     return OL_ERRC_INVALID_VALUE;
 
-  ol_program_handle_t Prog = new ol_program_handle_t_();
   Prog->Image = *Res;
   Prog->RefCount = 1;
+  Prog->ImageData = std::move(ImageData);
   *Program = Prog;
 
   return OL_SUCCESS;

>From fb8a1cca359a8a5c35eda4c806e90c0799a067a1 Mon Sep 17 00:00:00 2001
From: Callum Fare <callum at codeplay.com>
Date: Mon, 3 Feb 2025 10:28:41 +0000
Subject: [PATCH 05/17] Rework kernel arguments

---
 offload/liboffload/API/Kernel.td              | 26 +++++-
 .../liboffload/include/generated/OffloadAPI.h | 50 +++++++++-
 .../include/generated/OffloadEntryPoints.inc  | 44 +++++++++
 .../include/generated/OffloadFuncs.inc        |  2 +
 .../generated/OffloadImplFuncDecls.inc        |  3 +
 .../include/generated/OffloadPrint.hpp        | 14 +++
 offload/liboffload/src/OffloadImpl.cpp        | 91 +++++--------------
 7 files changed, 156 insertions(+), 74 deletions(-)

diff --git a/offload/liboffload/API/Kernel.td b/offload/liboffload/API/Kernel.td
index 936372c18ca37..4c8c84e9c71de 100644
--- a/offload/liboffload/API/Kernel.td
+++ b/offload/liboffload/API/Kernel.td
@@ -12,7 +12,7 @@ def : Function {
 
 def : Function {
     let name = "olRetainKernel";
-    let desc = "Create a queue for the given device";
+    let desc = "Increment the reference count of the given kernel";
     let details = [];
     let params = [
         Param<"ol_kernel_handle_t", "Kernel", "handle of the kernel", PARAM_IN>
@@ -22,7 +22,7 @@ def : Function {
 
 def : Function {
     let name = "olReleaseKernel";
-    let desc = "Create a queue for the given device";
+    let desc = "Decrement the reference count of the given kernel";
     let details = [];
     let params = [
         Param<"ol_kernel_handle_t", "Kernel", "handle of the kernel", PARAM_IN>
@@ -32,8 +32,11 @@ def : Function {
 
 def : Function {
     let name = "olSetKernelArgValue";
-    let desc = "Create a queue for the given device";
-    let details = [];
+    let desc = "Set the value of a single kernel argument at the given index";
+    let details = [
+        "The implementation will construct and lay out the backing storage for the kernel arguments.",
+        "The effects of calls to this function on a kernel are lost if olSetKernelArgsData is called."
+    ];
     let params = [
         Param<"ol_kernel_handle_t", "Kernel", "handle of the kernel", PARAM_IN>,
         Param<"uint32_t", "Index", "index of the argument", PARAM_IN>,
@@ -42,3 +45,18 @@ def : Function {
     ];
     let returns = [];
 }
+
+def : Function {
+    let name = "olSetKernelArgsData";
+    let desc = "Set the entire argument data for a kernel";
+    let details = [
+        "Previous calls to olSetKernelArgValue on the same kernel are invalidated by this function",
+        "The data pointed to by ArgsData is assumed to be laid out correctly according to the requirements of the backend API"
+    ];
+    let params = [
+        Param<"ol_kernel_handle_t", "Kernel", "handle of the kernel", PARAM_IN>,
+        Param<"void*", "ArgsData", "pointer to the argument data", PARAM_IN>,
+        Param<"size_t", "ArgsDataSize", "size of the argument data", PARAM_IN>
+    ];
+    let returns = [];
+}
diff --git a/offload/liboffload/include/generated/OffloadAPI.h b/offload/liboffload/include/generated/OffloadAPI.h
index 2384de19ae72e..155e31338c88b 100644
--- a/offload/liboffload/include/generated/OffloadAPI.h
+++ b/offload/liboffload/include/generated/OffloadAPI.h
@@ -838,7 +838,7 @@ OL_APIEXPORT ol_result_t OL_APICALL olCreateKernel(
     ol_kernel_handle_t *Kernel);
 
 ///////////////////////////////////////////////////////////////////////////////
-/// @brief Create a queue for the given device
+/// @brief Increment the reference count of the given kernel
 ///
 /// @details
 ///
@@ -854,7 +854,7 @@ OL_APIEXPORT ol_result_t OL_APICALL olRetainKernel(
     ol_kernel_handle_t Kernel);
 
 ///////////////////////////////////////////////////////////////////////////////
-/// @brief Create a queue for the given device
+/// @brief Decrement the reference count of the given kernel
 ///
 /// @details
 ///
@@ -870,9 +870,12 @@ OL_APIEXPORT ol_result_t OL_APICALL olReleaseKernel(
     ol_kernel_handle_t Kernel);
 
 ///////////////////////////////////////////////////////////////////////////////
-/// @brief Create a queue for the given device
+/// @brief Set the value of a single kernel argument at the given index
 ///
 /// @details
+///    - The implementation will construct and lay out the backing storage for
+///    the kernel arguments.The effects of calls to this function on a kernel
+///    are lost if olSetKernelArgsData is called.
 ///
 /// @returns
 ///     - ::OL_RESULT_SUCCESS
@@ -892,6 +895,30 @@ OL_APIEXPORT ol_result_t OL_APICALL olSetKernelArgValue(
     // [in] pointer to the argument data
     void *ArgData);
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Set the entire argument data for a kernel
+///
+/// @details
+///    - Previous calls to olSetKernelArgValue on the same kernel are
+///    invalidated by this functionThe data pointed to by ArgsData is assumed to
+///    be laid out correctly according to the requirements of the backend API
+///
+/// @returns
+///     - ::OL_RESULT_SUCCESS
+///     - ::OL_ERRC_UNINITIALIZED
+///     - ::OL_ERRC_DEVICE_LOST
+///     - ::OL_ERRC_INVALID_NULL_HANDLE
+///         + `NULL == Kernel`
+///     - ::OL_ERRC_INVALID_NULL_POINTER
+///         + `NULL == ArgsData`
+OL_APIEXPORT ol_result_t OL_APICALL olSetKernelArgsData(
+    // [in] handle of the kernel
+    ol_kernel_handle_t Kernel,
+    // [in] pointer to the argument data
+    void *ArgsData,
+    // [in] size of the argument data
+    size_t ArgsDataSize);
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Function parameters for olGetPlatform
 /// @details Each entry is a pointer to the parameter passed to the function;
@@ -1133,6 +1160,15 @@ typedef struct ol_set_kernel_arg_value_params_t {
   void **pArgData;
 } ol_set_kernel_arg_value_params_t;
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Function parameters for olSetKernelArgsData
+/// @details Each entry is a pointer to the parameter passed to the function;
+typedef struct ol_set_kernel_args_data_params_t {
+  ol_kernel_handle_t *pKernel;
+  void **pArgsData;
+  size_t *pArgsDataSize;
+} ol_set_kernel_args_data_params_t;
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Variant of olInit that also sets source code location information
 /// @details See also ::olInit
@@ -1358,6 +1394,14 @@ OL_APIEXPORT ol_result_t OL_APICALL olSetKernelArgValueWithCodeLoc(
     ol_kernel_handle_t Kernel, uint32_t Index, size_t Size, void *ArgData,
     ol_code_location_t *CodeLocation);
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Variant of olSetKernelArgsData that also sets source code location
+/// information
+/// @details See also ::olSetKernelArgsData
+OL_APIEXPORT ol_result_t OL_APICALL olSetKernelArgsDataWithCodeLoc(
+    ol_kernel_handle_t Kernel, void *ArgsData, size_t ArgsDataSize,
+    ol_code_location_t *CodeLocation);
+
 #if defined(__cplusplus)
 } // extern "C"
 #endif
diff --git a/offload/liboffload/include/generated/OffloadEntryPoints.inc b/offload/liboffload/include/generated/OffloadEntryPoints.inc
index 0ae3c36f95827..fd022795a5d40 100644
--- a/offload/liboffload/include/generated/OffloadEntryPoints.inc
+++ b/offload/liboffload/include/generated/OffloadEntryPoints.inc
@@ -1311,3 +1311,47 @@ ol_result_t olSetKernelArgValueWithCodeLoc(ol_kernel_handle_t Kernel,
   currentCodeLocation() = nullptr;
   return Result;
 }
+
+///////////////////////////////////////////////////////////////////////////////
+ol_impl_result_t olSetKernelArgsData_val(ol_kernel_handle_t Kernel,
+                                         void *ArgsData, size_t ArgsDataSize) {
+  if (true /*enableParameterValidation*/) {
+    if (NULL == Kernel) {
+      return OL_ERRC_INVALID_NULL_HANDLE;
+    }
+
+    if (NULL == ArgsData) {
+      return OL_ERRC_INVALID_NULL_POINTER;
+    }
+  }
+
+  return olSetKernelArgsData_impl(Kernel, ArgsData, ArgsDataSize);
+}
+OL_APIEXPORT ol_result_t OL_APICALL olSetKernelArgsData(
+    ol_kernel_handle_t Kernel, void *ArgsData, size_t ArgsDataSize) {
+  if (offloadConfig().TracingEnabled) {
+    std::cout << "---> olSetKernelArgsData";
+  }
+
+  ol_result_t Result = olSetKernelArgsData_val(Kernel, ArgsData, ArgsDataSize);
+
+  if (offloadConfig().TracingEnabled) {
+    ol_set_kernel_args_data_params_t Params = {&Kernel, &ArgsData,
+                                               &ArgsDataSize};
+    std::cout << "(" << &Params << ")";
+    std::cout << "-> " << Result << "\n";
+    if (Result && Result->Details) {
+      std::cout << "     *Error Details* " << Result->Details << " \n";
+    }
+  }
+  return Result;
+}
+ol_result_t olSetKernelArgsDataWithCodeLoc(ol_kernel_handle_t Kernel,
+                                           void *ArgsData, size_t ArgsDataSize,
+                                           ol_code_location_t *CodeLocation) {
+  currentCodeLocation() = CodeLocation;
+  ol_result_t Result = olSetKernelArgsData(Kernel, ArgsData, ArgsDataSize);
+
+  currentCodeLocation() = nullptr;
+  return Result;
+}
diff --git a/offload/liboffload/include/generated/OffloadFuncs.inc b/offload/liboffload/include/generated/OffloadFuncs.inc
index 6f2bb34599a1d..05a8e47251254 100644
--- a/offload/liboffload/include/generated/OffloadFuncs.inc
+++ b/offload/liboffload/include/generated/OffloadFuncs.inc
@@ -40,6 +40,7 @@ OFFLOAD_FUNC(olCreateKernel)
 OFFLOAD_FUNC(olRetainKernel)
 OFFLOAD_FUNC(olReleaseKernel)
 OFFLOAD_FUNC(olSetKernelArgValue)
+OFFLOAD_FUNC(olSetKernelArgsData)
 OFFLOAD_FUNC(olInitWithCodeLoc)
 OFFLOAD_FUNC(olShutDownWithCodeLoc)
 OFFLOAD_FUNC(olGetPlatformWithCodeLoc)
@@ -70,5 +71,6 @@ OFFLOAD_FUNC(olCreateKernelWithCodeLoc)
 OFFLOAD_FUNC(olRetainKernelWithCodeLoc)
 OFFLOAD_FUNC(olReleaseKernelWithCodeLoc)
 OFFLOAD_FUNC(olSetKernelArgValueWithCodeLoc)
+OFFLOAD_FUNC(olSetKernelArgsDataWithCodeLoc)
 
 #undef OFFLOAD_FUNC
diff --git a/offload/liboffload/include/generated/OffloadImplFuncDecls.inc b/offload/liboffload/include/generated/OffloadImplFuncDecls.inc
index 9d21d8fc97090..9401b20f97c11 100644
--- a/offload/liboffload/include/generated/OffloadImplFuncDecls.inc
+++ b/offload/liboffload/include/generated/OffloadImplFuncDecls.inc
@@ -97,3 +97,6 @@ ol_impl_result_t olReleaseKernel_impl(ol_kernel_handle_t Kernel);
 ol_impl_result_t olSetKernelArgValue_impl(ol_kernel_handle_t Kernel,
                                           uint32_t Index, size_t Size,
                                           void *ArgData);
+
+ol_impl_result_t olSetKernelArgsData_impl(ol_kernel_handle_t Kernel,
+                                          void *ArgsData, size_t ArgsDataSize);
diff --git a/offload/liboffload/include/generated/OffloadPrint.hpp b/offload/liboffload/include/generated/OffloadPrint.hpp
index 698b422fc38d0..a9656d4ee45d6 100644
--- a/offload/liboffload/include/generated/OffloadPrint.hpp
+++ b/offload/liboffload/include/generated/OffloadPrint.hpp
@@ -665,6 +665,20 @@ operator<<(std::ostream &os,
   return os;
 }
 
+inline std::ostream &
+operator<<(std::ostream &os,
+           const struct ol_set_kernel_args_data_params_t *params) {
+  os << ".Kernel = ";
+  printPtr(os, *params->pKernel);
+  os << ", ";
+  os << ".ArgsData = ";
+  printPtr(os, *params->pArgsData);
+  os << ", ";
+  os << ".ArgsDataSize = ";
+  os << *params->pArgsDataSize;
+  return os;
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 // @brief Print pointer value
 template <typename T>
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index 1fbb424021eef..7d57c0696ad9e 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -54,40 +54,20 @@ struct ol_program_handle_t_ {
 };
 
 struct OffloadArguments {
-  static constexpr size_t MaxParamBytes = 4000u;
+  static constexpr size_t MaxParamBytes = 4096u;
   using args_t = std::array<char, MaxParamBytes>;
   using args_size_t = std::vector<size_t>;
-  using args_index_t = std::vector<void *>;
+  using args_ptr_t = std::vector<void *>;
   args_t Storage;
   args_size_t ParamSizes;
-  args_index_t Indices;
-  args_size_t OffsetPerIndex;
-
-  std::uint32_t ImplicitOffsetArgs[3] = {0, 0, 0};
-
-  // NOTE:
-  // This implementation is an exact copy of the CUDA adapter's argument
-  // implementation. Even though it was designed for CUDA, the design of
-  // libomptarget means it should work for other plugins as they will expect
-  // the same argument layout.
-  OffloadArguments() {
-    // Place the implicit offset index at the end of the indicies collection
-    Indices.emplace_back(&ImplicitOffsetArgs);
-  }
-
-  /// Add an argument to the kernel.
-  /// If the argument existed before, it is replaced.
-  /// Otherwise, it is added.
-  /// Gaps are filled with empty arguments.
-  /// Implicit offset argument is kept at the back of the indices collection.
-  void addArg(size_t Index, size_t Size, const void *Arg,
-              size_t LocalSize = 0) {
-    if (Index + 2 > Indices.size()) {
-      // Move implicit offset argument index with the end
-      Indices.resize(Index + 2, Indices.back());
-      // Ensure enough space for the new argument
+  args_ptr_t Pointers;
+
+  // Add an argument. If it already exists, it is replaced. Gaps are filled with
+  // empty arguments. Previous setArgsData calls are invalidated.
+  void addArg(size_t Index, size_t Size, const void *Arg) {
+    if (Index + 1 > Pointers.size()) {
+      Pointers.resize(Index + 1);
       ParamSizes.resize(Index + 1);
-      OffsetPerIndex.resize(Index + 1);
     }
     ParamSizes[Index] = Size;
     // calculate the insertion point on the array
@@ -95,46 +75,17 @@ struct OffloadArguments {
                                        std::begin(ParamSizes) + Index, 0);
     // Update the stored value for the argument
     std::memcpy(&Storage[InsertPos], Arg, Size);
-    Indices[Index] = &Storage[InsertPos];
-    OffsetPerIndex[Index] = LocalSize;
+    Pointers[Index] = &Storage[InsertPos];
   }
 
-  void addLocalArg(size_t Index, size_t Size) {
-    size_t LocalOffset = this->getLocalSize();
-
-    // maximum required alignment is the size of the largest vector type
-    const size_t MaxAlignment = sizeof(double) * 16;
-
-    // for arguments smaller than the maximum alignment simply align to the
-    // size of the argument
-    const size_t Alignment = std::min(MaxAlignment, Size);
-
-    // align the argument
-    size_t AlignedLocalOffset = LocalOffset;
-    size_t Pad = LocalOffset % Alignment;
-    if (Pad != 0) {
-      AlignedLocalOffset += Alignment - Pad;
-    }
-
-    addArg(Index, sizeof(size_t), (const void *)&(AlignedLocalOffset),
-           Size + (AlignedLocalOffset - LocalOffset));
+  // Set all argument data at once. Previous addArg calls are invalidated.
+  void setArgsData(const void *Data, size_t Size) {
+    std::memcpy(Storage.data(), Data, Size);
+    Pointers.clear();
+    ParamSizes.clear();
   }
 
-  void setImplicitOffset(size_t Size, std::uint32_t *ImplicitOffset) {
-    assert(Size == sizeof(std::uint32_t) * 3);
-    std::memcpy(ImplicitOffsetArgs, ImplicitOffset, Size);
-  }
-
-  void clearLocalSize() {
-    std::fill(std::begin(OffsetPerIndex), std::end(OffsetPerIndex), 0);
-  }
-
-  const args_index_t &getIndices() const noexcept { return Indices; }
-
-  uint32_t getLocalSize() const {
-    return std::accumulate(std::begin(OffsetPerIndex), std::end(OffsetPerIndex),
-                           0);
-  }
+  const args_ptr_t &getPointers() const noexcept { return Pointers; }
 
   const char *getStorage() const noexcept { return Storage.data(); }
 };
@@ -618,7 +569,7 @@ ol_impl_result_t olEnqueueKernelLaunch_impl(ol_queue_handle_t Queue,
   AsyncInfoWrapperTy AsyncInfoWrapper(DeviceImpl, Queue->AsyncInfo);
 
   KernelArgsTy LaunchArgs{};
-  LaunchArgs.NumArgs = Kernel->Args.getIndices().size() - 1; // TODO
+  LaunchArgs.NumArgs = Kernel->Args.getPointers().size();
   LaunchArgs.NumTeams[0] = GlobalWorkSize[0];
   LaunchArgs.NumTeams[1] = 1;
   LaunchArgs.NumTeams[2] = 1;
@@ -628,7 +579,7 @@ ol_impl_result_t olEnqueueKernelLaunch_impl(ol_queue_handle_t Queue,
 
   LaunchArgs.ArgPtrs = (void **)Kernel->Args.getStorage();
 
-  // TODO: Verify this
+  // No offsets needed, arguments are real pointers
   auto ArgOffsets = std::vector<ptrdiff_t>(LaunchArgs.NumArgs, 0ul);
 
   auto Err = Kernel->KernelImpl->launch(
@@ -646,3 +597,13 @@ ol_impl_result_t olEnqueueKernelLaunch_impl(ol_queue_handle_t Queue,
 
   return OL_SUCCESS;
 }
+
+// Replaces all of the kernel's argument data with a copy of ArgsData.
+ol_impl_result_t olSetKernelArgsData_impl(ol_kernel_handle_t Kernel,
+                                          void *ArgsData, size_t ArgsDataSize) {
+  // The data is copied into a fixed-size buffer, so reject oversized input.
+  if (ArgsDataSize > OffloadArguments::MaxParamBytes)
+    return OL_ERRC_INVALID_SIZE;
+  Kernel->Args.setArgsData(ArgsData, ArgsDataSize);
+  return OL_SUCCESS;
+}

>From df9eb3e97edc8504c86e3940bb0f1f3065a9658e Mon Sep 17 00:00:00 2001
From: Callum Fare <callum at codeplay.com>
Date: Tue, 4 Feb 2025 17:23:06 +0000
Subject: [PATCH 06/17] Update Offload unit tests

---
 offload/unittests/OffloadAPI/CMakeLists.txt   |  8 ++++
 .../unittests/OffloadAPI/common/Fixtures.hpp  | 18 ++++++-
 .../OffloadAPI/enqueue/olEnqueueDataCopy.cpp  | 36 ++++++++++++++
 .../OffloadAPI/enqueue/olEnqueueDataRead.cpp  | 29 ++++++++++++
 .../OffloadAPI/enqueue/olEnqueueDataWrite.cpp | 26 ++++++++++
 .../OffloadAPI/memory/olMemAlloc.cpp          | 45 ++++++++++++++++++
 .../unittests/OffloadAPI/memory/olMemFree.cpp | 47 +++++++++++++++++++
 .../OffloadAPI/platform/olPlatformInfo.hpp    |  1 +
 .../OffloadAPI/queue/olCreateQueue.cpp        |  9 ++++
 .../OffloadAPI/queue/olFinishQueue.cpp        | 17 +++++++
 .../OffloadAPI/queue/olReleaseQueue.cpp       | 21 +++++++++
 .../OffloadAPI/queue/olRetainQueue.cpp        | 20 ++++++++
 12 files changed, 276 insertions(+), 1 deletion(-)
 create mode 100644 offload/unittests/OffloadAPI/enqueue/olEnqueueDataCopy.cpp
 create mode 100644 offload/unittests/OffloadAPI/enqueue/olEnqueueDataRead.cpp
 create mode 100644 offload/unittests/OffloadAPI/enqueue/olEnqueueDataWrite.cpp
 create mode 100644 offload/unittests/OffloadAPI/memory/olMemAlloc.cpp
 create mode 100644 offload/unittests/OffloadAPI/memory/olMemFree.cpp
 create mode 100644 offload/unittests/OffloadAPI/queue/olFinishQueue.cpp
 create mode 100644 offload/unittests/OffloadAPI/queue/olReleaseQueue.cpp
 create mode 100644 offload/unittests/OffloadAPI/queue/olRetainQueue.cpp

diff --git a/offload/unittests/OffloadAPI/CMakeLists.txt b/offload/unittests/OffloadAPI/CMakeLists.txt
index e0d790684898d..c7f28d147db14 100644
--- a/offload/unittests/OffloadAPI/CMakeLists.txt
+++ b/offload/unittests/OffloadAPI/CMakeLists.txt
@@ -12,6 +12,14 @@ add_libompt_unittest("offload.unittests"
     ${CMAKE_CURRENT_SOURCE_DIR}/device/olGetDeviceInfo.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/device/olGetDeviceInfoSize.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/queue/olCreateQueue.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/queue/olFinishQueue.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/queue/olReleaseQueue.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/queue/olRetainQueue.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/memory/olMemAlloc.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/memory/olMemFree.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/enqueue/olEnqueueDataWrite.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/enqueue/olEnqueueDataRead.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/enqueue/olEnqueueDataCopy.cpp
     )
 add_dependencies("offload.unittests" ${PLUGINS_TEST_COMMON})
 target_link_libraries("offload.unittests" PRIVATE ${PLUGINS_TEST_COMMON})
diff --git a/offload/unittests/OffloadAPI/common/Fixtures.hpp b/offload/unittests/OffloadAPI/common/Fixtures.hpp
index 410a435dee1b5..2b85137a77d82 100644
--- a/offload/unittests/OffloadAPI/common/Fixtures.hpp
+++ b/offload/unittests/OffloadAPI/common/Fixtures.hpp
@@ -60,5 +60,21 @@ struct offloadDeviceTest : offloadPlatformTest {
     ASSERT_SUCCESS(olGetDevice(Platform, 1, &Device));
   }
 
-  ol_device_handle_t Device;
+  ol_device_handle_t Device = nullptr;
+};
+
+struct offloadQueueTest : offloadDeviceTest {
+  void SetUp() override {
+    RETURN_ON_FATAL_FAILURE(offloadDeviceTest::SetUp());
+    ASSERT_SUCCESS(olCreateQueue(Device, &Queue));
+  }
+
+  void TearDown() override {
+    if (Queue) {
+      olReleaseQueue(Queue);
+    }
+    RETURN_ON_FATAL_FAILURE(offloadDeviceTest::TearDown());
+  }
+
+  ol_queue_handle_t Queue = nullptr;
 };
diff --git a/offload/unittests/OffloadAPI/enqueue/olEnqueueDataCopy.cpp b/offload/unittests/OffloadAPI/enqueue/olEnqueueDataCopy.cpp
new file mode 100644
index 0000000000000..afc5866821e36
--- /dev/null
+++ b/offload/unittests/OffloadAPI/enqueue/olEnqueueDataCopy.cpp
@@ -0,0 +1,36 @@
+//===------- Offload API tests - olEnqueueDataCopy ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "../common/Fixtures.hpp"
+#include <OffloadAPI.h>
+#include <gtest/gtest.h>
+
+using olEnqueueDataCopyTest = offloadQueueTest;
+
+TEST_F(olEnqueueDataCopyTest, Success) {
+  constexpr size_t Size = 1024;
+  void *AllocA;
+  void *AllocB;
+  std::vector<uint8_t> Input(Size, 42);
+  std::vector<uint8_t> Output(Size, 0);
+
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, Size, 0, &AllocA));
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, Size, 0, &AllocB));
+  ASSERT_SUCCESS(
+      olEnqueueDataWrite(Queue, Input.data(), AllocA, Size, nullptr));
+  ASSERT_SUCCESS(
+      olEnqueueDataCopy(Queue, AllocA, AllocB, Device, Size, nullptr));
+  ASSERT_SUCCESS(
+      olEnqueueDataRead(Queue, AllocB, Output.data(), Size, nullptr));
+  ASSERT_SUCCESS(olFinishQueue(Queue));
+  for (uint8_t Val : Output) {
+    ASSERT_EQ(Val, 42);
+  }
+  ASSERT_SUCCESS(olMemFree(Device, OL_ALLOC_TYPE_DEVICE, AllocA));
+  ASSERT_SUCCESS(olMemFree(Device, OL_ALLOC_TYPE_DEVICE, AllocB));
+}
diff --git a/offload/unittests/OffloadAPI/enqueue/olEnqueueDataRead.cpp b/offload/unittests/OffloadAPI/enqueue/olEnqueueDataRead.cpp
new file mode 100644
index 0000000000000..76d3490cc8737
--- /dev/null
+++ b/offload/unittests/OffloadAPI/enqueue/olEnqueueDataRead.cpp
@@ -0,0 +1,29 @@
+//===------- Offload API tests - olEnqueueDataRead ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "../common/Fixtures.hpp"
+#include <OffloadAPI.h>
+#include <gtest/gtest.h>
+
+using olEnqueueDataReadTest = offloadQueueTest;
+
+TEST_F(olEnqueueDataReadTest, Success) {
+  constexpr size_t Size = 1024;
+  void *Alloc;
+  std::vector<uint8_t> Input(Size, 42);
+  std::vector<uint8_t> Output(Size, 0);
+
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, Size, 0, &Alloc));
+  ASSERT_SUCCESS(olEnqueueDataWrite(Queue, Input.data(), Alloc, Size, nullptr));
+  ASSERT_SUCCESS(olEnqueueDataRead(Queue, Alloc, Output.data(), Size, nullptr));
+  ASSERT_SUCCESS(olFinishQueue(Queue));
+  for (uint8_t Val : Output) {
+    ASSERT_EQ(Val, 42);
+  }
+  ASSERT_SUCCESS(olMemFree(Device, OL_ALLOC_TYPE_DEVICE, Alloc));
+}
diff --git a/offload/unittests/OffloadAPI/enqueue/olEnqueueDataWrite.cpp b/offload/unittests/OffloadAPI/enqueue/olEnqueueDataWrite.cpp
new file mode 100644
index 0000000000000..ad66887643d56
--- /dev/null
+++ b/offload/unittests/OffloadAPI/enqueue/olEnqueueDataWrite.cpp
@@ -0,0 +1,26 @@
+//===------- Offload API tests - olEnqueueDataWrite -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "../common/Fixtures.hpp"
+#include <OffloadAPI.h>
+#include <gtest/gtest.h>
+
+using olEnqueueDataWriteTest = offloadQueueTest;
+
+TEST_F(olEnqueueDataWriteTest, Success) {
+  constexpr size_t Size = 1024;
+  void *Alloc = nullptr;
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, Size, 0, &Alloc));
+  std::vector<uint8_t> Input(Size, 42);
+  ASSERT_SUCCESS(
+      olEnqueueDataWrite(Queue, Input.data(), Alloc, Size, nullptr));
+  ASSERT_SUCCESS(olFinishQueue(Queue));
+  ASSERT_SUCCESS(olMemFree(Device, OL_ALLOC_TYPE_DEVICE, Alloc));
+}
+
+
diff --git a/offload/unittests/OffloadAPI/memory/olMemAlloc.cpp b/offload/unittests/OffloadAPI/memory/olMemAlloc.cpp
new file mode 100644
index 0000000000000..e951231d4a0e9
--- /dev/null
+++ b/offload/unittests/OffloadAPI/memory/olMemAlloc.cpp
@@ -0,0 +1,45 @@
+//===------- Offload API tests - olMemAlloc -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "../common/Fixtures.hpp"
+#include <OffloadAPI.h>
+#include <gtest/gtest.h>
+
+using olMemAllocTest = offloadDeviceTest;
+
+TEST_F(olMemAllocTest, SuccessAllocShared) {
+  void *Alloc = nullptr;
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_SHARED, 1024, 0, &Alloc));
+  ASSERT_NE(Alloc, nullptr);
+  olMemFree(Device, OL_ALLOC_TYPE_SHARED, Alloc);
+}
+
+TEST_F(olMemAllocTest, SuccessAllocHost) {
+  void *Alloc = nullptr;
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_HOST, 1024, 0, &Alloc));
+  ASSERT_NE(Alloc, nullptr);
+  olMemFree(Device, OL_ALLOC_TYPE_HOST, Alloc);
+}
+
+TEST_F(olMemAllocTest, SuccessAllocDevice) {
+  void *Alloc = nullptr;
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, 1024, 0, &Alloc));
+  ASSERT_NE(Alloc, nullptr);
+  olMemFree(Device, OL_ALLOC_TYPE_DEVICE, Alloc);
+}
+
+TEST_F(olMemAllocTest, InvalidNullDevice) {
+  void *Alloc = nullptr;
+  ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE,
+               olMemAlloc(nullptr, OL_ALLOC_TYPE_DEVICE, 1024, 0, &Alloc));
+}
+
+TEST_F(olMemAllocTest, InvalidNullOutPtr) {
+  ASSERT_ERROR(OL_ERRC_INVALID_NULL_POINTER,
+               olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, 1024, 0, nullptr));
+}
diff --git a/offload/unittests/OffloadAPI/memory/olMemFree.cpp b/offload/unittests/OffloadAPI/memory/olMemFree.cpp
new file mode 100644
index 0000000000000..54e8a24f9fbba
--- /dev/null
+++ b/offload/unittests/OffloadAPI/memory/olMemFree.cpp
@@ -0,0 +1,47 @@
+//===------- Offload API tests - olMemFree --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "../common/Fixtures.hpp"
+#include <OffloadAPI.h>
+#include <gtest/gtest.h>
+
+using olMemFreeTest = offloadDeviceTest;
+
+TEST_F(olMemFreeTest, SuccessFreeShared) {
+  void *Alloc = nullptr;
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_SHARED, 1024, 0, &Alloc));
+  ASSERT_SUCCESS(olMemFree(Device, OL_ALLOC_TYPE_SHARED, Alloc));
+}
+
+TEST_F(olMemFreeTest, SuccessFreeHost) {
+  void *Alloc = nullptr;
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_HOST, 1024, 0, &Alloc));
+  ASSERT_SUCCESS(olMemFree(Device, OL_ALLOC_TYPE_HOST, Alloc));
+}
+
+TEST_F(olMemFreeTest, SuccessFreeDevice) {
+  void *Alloc = nullptr;
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, 1024, 0, &Alloc));
+  ASSERT_SUCCESS(olMemFree(Device, OL_ALLOC_TYPE_DEVICE, Alloc));
+}
+
+TEST_F(olMemFreeTest, InvalidNullDevice) {
+  void *Alloc = nullptr;
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, 1024, 0, &Alloc));
+  ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE,
+               olMemFree(nullptr, OL_ALLOC_TYPE_DEVICE, Alloc));
+  ASSERT_SUCCESS(olMemFree(Device, OL_ALLOC_TYPE_DEVICE, Alloc));
+}
+
+TEST_F(olMemFreeTest, InvalidNullPtr) {
+  void *Alloc = nullptr;
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, 1024, 0, &Alloc));
+  ASSERT_ERROR(OL_ERRC_INVALID_NULL_POINTER,
+               olMemFree(Device, OL_ALLOC_TYPE_DEVICE, nullptr));
+  ASSERT_SUCCESS(olMemFree(Device, OL_ALLOC_TYPE_DEVICE, Alloc));
+}
diff --git a/offload/unittests/OffloadAPI/platform/olPlatformInfo.hpp b/offload/unittests/OffloadAPI/platform/olPlatformInfo.hpp
index d49cdb90d321a..f61bca0cf52f0 100644
--- a/offload/unittests/OffloadAPI/platform/olPlatformInfo.hpp
+++ b/offload/unittests/OffloadAPI/platform/olPlatformInfo.hpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 #pragma once
 
+#include <unordered_map>
 #include <vector>
 
 // TODO: We could autogenerate these
diff --git a/offload/unittests/OffloadAPI/queue/olCreateQueue.cpp b/offload/unittests/OffloadAPI/queue/olCreateQueue.cpp
index f542dac4bb2d8..0e19f03c11776 100644
--- a/offload/unittests/OffloadAPI/queue/olCreateQueue.cpp
+++ b/offload/unittests/OffloadAPI/queue/olCreateQueue.cpp
@@ -17,3 +17,12 @@ TEST_F(olCreateQueueTest, Success) {
   ASSERT_SUCCESS(olCreateQueue(Device, &Queue));
   ASSERT_NE(Queue, nullptr);
 }
+
+TEST_F(olCreateQueueTest, InvalidNullHandleDevice) {
+  ol_queue_handle_t Queue = nullptr;
+  ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE, olCreateQueue(nullptr, &Queue));
+}
+
+TEST_F(olCreateQueueTest, InvalidNullPointerQueue) {
+  ASSERT_ERROR(OL_ERRC_INVALID_NULL_POINTER, olCreateQueue(Device, nullptr));
+}
diff --git a/offload/unittests/OffloadAPI/queue/olFinishQueue.cpp b/offload/unittests/OffloadAPI/queue/olFinishQueue.cpp
new file mode 100644
index 0000000000000..7c7d3553083fb
--- /dev/null
+++ b/offload/unittests/OffloadAPI/queue/olFinishQueue.cpp
@@ -0,0 +1,17 @@
+//===------- Offload API tests - olFinishQueue ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "../common/Fixtures.hpp"
+#include <OffloadAPI.h>
+#include <gtest/gtest.h>
+
+using olFinishQueueTest = offloadQueueTest;
+
+TEST_F(olFinishQueueTest, SuccessEmptyQueue) {
+  ASSERT_SUCCESS(olFinishQueue(Queue));
+}
diff --git a/offload/unittests/OffloadAPI/queue/olReleaseQueue.cpp b/offload/unittests/OffloadAPI/queue/olReleaseQueue.cpp
new file mode 100644
index 0000000000000..392f49bc2f80d
--- /dev/null
+++ b/offload/unittests/OffloadAPI/queue/olReleaseQueue.cpp
@@ -0,0 +1,21 @@
+//===------- Offload API tests - olReleaseQueue ---------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "../common/Fixtures.hpp"
+#include <OffloadAPI.h>
+#include <gtest/gtest.h>
+
+using olReleaseQueueTest = offloadQueueTest;
+
+// TODO: When we can fetch queue info we can check the reference count is
+// changing in an expected way. In the meantime just check the entry point
+// doesn't blow up.
+TEST_F(olReleaseQueueTest, Success) {
+  ASSERT_SUCCESS(olRetainQueue(Queue));
+  ASSERT_SUCCESS(olReleaseQueue(Queue));
+}
diff --git a/offload/unittests/OffloadAPI/queue/olRetainQueue.cpp b/offload/unittests/OffloadAPI/queue/olRetainQueue.cpp
new file mode 100644
index 0000000000000..9e499d849c742
--- /dev/null
+++ b/offload/unittests/OffloadAPI/queue/olRetainQueue.cpp
@@ -0,0 +1,20 @@
+//===------- Offload API tests - olRetainQueue ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "../common/Fixtures.hpp"
+#include <OffloadAPI.h>
+#include <gtest/gtest.h>
+
+using olRetainQueueTest = offloadQueueTest;
+
+// TODO: When we can fetch queue info we can check the reference count is
+// changing in the expected way. In the meantime just check the entry point
+// doesn't blow up.
+TEST_F(olRetainQueueTest, Success) {
+  ASSERT_SUCCESS(olRetainQueue(Queue));
+}

>From 71326aedc8a5748b15652054a42e61a173501d21 Mon Sep 17 00:00:00 2001
From: Callum Fare <callum at codeplay.com>
Date: Wed, 5 Feb 2025 11:53:19 +0000
Subject: [PATCH 07/17] Kernel launch size arguments

---
 offload/liboffload/API/Enqueue.td             | 16 +++++-
 .../liboffload/include/generated/OffloadAPI.h | 24 ++++++--
 .../include/generated/OffloadEntryPoints.inc  | 34 ++++++------
 .../generated/OffloadImplFuncDecls.inc        |  8 +--
 .../include/generated/OffloadPrint.hpp        | 55 ++++++++++++++++++-
 offload/liboffload/src/OffloadImpl.cpp        | 20 +++----
 offload/tools/offload-tblgen/PrintGen.cpp     | 36 +++++++++++-
 offload/tools/offload-tblgen/RecordTypes.hpp  |  2 +
 8 files changed, 153 insertions(+), 42 deletions(-)

diff --git a/offload/liboffload/API/Enqueue.td b/offload/liboffload/API/Enqueue.td
index 621eb3a2f410e..d9215e8175ef8 100644
--- a/offload/liboffload/API/Enqueue.td
+++ b/offload/liboffload/API/Enqueue.td
@@ -54,6 +54,20 @@ def : Function {
 }
 
 
+def : Struct {
+    let name = "ol_kernel_launch_size_args_t";
+    let desc = "Size-related arguments for a kernel launch.";
+    let members = [
+        StructMember<"size_t", "Dimensions", "Number of work dimensions">,
+        StructMember<"size_t", "NumGroupsX", "Number of work groups on the X dimension">,
+        StructMember<"size_t", "NumGroupsY", "Number of work groups on the Y dimension">,
+        StructMember<"size_t", "NumGroupsZ", "Number of work groups on the Z dimension">,
+        StructMember<"size_t", "GroupSizeX", "Size of a work group on the X dimension.">,
+        StructMember<"size_t", "GroupSizeY", "Size of a work group on the Y dimension.">,
+        StructMember<"size_t", "GroupSizeZ", "Size of a work group on the Z dimension.">
+    ];
+}
+
 def : Function {
     let name = "olEnqueueKernelLaunch";
     let desc = "Enqueue a kernel launch with the specified size and parameters";
@@ -61,7 +75,7 @@ def : Function {
     let params = [
         Param<"ol_queue_handle_t", "Queue", "handle of the queue", PARAM_IN>,
         Param<"ol_kernel_handle_t", "Kernel", "handle of the kernel", PARAM_IN>,
-        Param<"const size_t*", "GlobalWorkSize", "an array of size 3 representing the global work size", PARAM_IN>,
+        Param<"const ol_kernel_launch_size_args_t*", "LaunchSizeArgs", "pointer to the struct containing launch size parameters", PARAM_IN>,
         Param<"ol_event_handle_t*", "EventOut", "optional recorded event for the enqueued operation", PARAM_OUT_OPTIONAL>
     ];
     let returns = [];
diff --git a/offload/liboffload/include/generated/OffloadAPI.h b/offload/liboffload/include/generated/OffloadAPI.h
index 155e31338c88b..4f4ff51bef5d0 100644
--- a/offload/liboffload/include/generated/OffloadAPI.h
+++ b/offload/liboffload/include/generated/OffloadAPI.h
@@ -735,6 +735,18 @@ OL_APIEXPORT ol_result_t OL_APICALL olEnqueueDataCopy(
     // [out][optional] optional recorded event for the enqueued operation
     ol_event_handle_t *EventOut);
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Size-related arguments for a kernel launch.
+typedef struct ol_kernel_launch_size_args_t {
+  size_t Dimensions; /// Number of work dimensions
+  size_t NumGroupsX; /// Number of work groups on the X dimension
+  size_t NumGroupsY; /// Number of work groups on the Y dimension
+  size_t NumGroupsZ; /// Number of work groups on the Z dimension
+  size_t GroupSizeX; /// Size of a work group on the X dimension.
+  size_t GroupSizeY; /// Size of a work group on the Y dimension.
+  size_t GroupSizeZ; /// Size of a work group on the Z dimension.
+} ol_kernel_launch_size_args_t;
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Enqueue a kernel launch with the specified size and parameters
 ///
@@ -748,14 +760,14 @@ OL_APIEXPORT ol_result_t OL_APICALL olEnqueueDataCopy(
 ///         + `NULL == Queue`
 ///         + `NULL == Kernel`
 ///     - ::OL_ERRC_INVALID_NULL_POINTER
-///         + `NULL == GlobalWorkSize`
+///         + `NULL == LaunchSizeArgs`
 OL_APIEXPORT ol_result_t OL_APICALL olEnqueueKernelLaunch(
     // [in] handle of the queue
     ol_queue_handle_t Queue,
     // [in] handle of the kernel
     ol_kernel_handle_t Kernel,
-    // [in] an array of size 3 representing the global work size
-    const size_t *GlobalWorkSize,
+    // [in] pointer to the struct containing launch size parameters
+    const ol_kernel_launch_size_args_t *LaunchSizeArgs,
     // [out][optional] optional recorded event for the enqueued operation
     ol_event_handle_t *EventOut);
 
@@ -1099,7 +1111,7 @@ typedef struct ol_enqueue_data_copy_params_t {
 typedef struct ol_enqueue_kernel_launch_params_t {
   ol_queue_handle_t *pQueue;
   ol_kernel_handle_t *pKernel;
-  const size_t **pGlobalWorkSize;
+  const ol_kernel_launch_size_args_t **pLaunchSizeArgs;
   ol_event_handle_t **pEventOut;
 } ol_enqueue_kernel_launch_params_t;
 
@@ -1339,8 +1351,8 @@ OL_APIEXPORT ol_result_t OL_APICALL olEnqueueDataCopyWithCodeLoc(
 /// @details See also ::olEnqueueKernelLaunch
 OL_APIEXPORT ol_result_t OL_APICALL olEnqueueKernelLaunchWithCodeLoc(
     ol_queue_handle_t Queue, ol_kernel_handle_t Kernel,
-    const size_t *GlobalWorkSize, ol_event_handle_t *EventOut,
-    ol_code_location_t *CodeLocation);
+    const ol_kernel_launch_size_args_t *LaunchSizeArgs,
+    ol_event_handle_t *EventOut, ol_code_location_t *CodeLocation);
 
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Variant of olCreateProgram that also sets source code location
diff --git a/offload/liboffload/include/generated/OffloadEntryPoints.inc b/offload/liboffload/include/generated/OffloadEntryPoints.inc
index fd022795a5d40..57cf4d64744cc 100644
--- a/offload/liboffload/include/generated/OffloadEntryPoints.inc
+++ b/offload/liboffload/include/generated/OffloadEntryPoints.inc
@@ -964,10 +964,10 @@ ol_result_t olEnqueueDataCopyWithCodeLoc(ol_queue_handle_t Queue, void *SrcPtr,
 }
 
 ///////////////////////////////////////////////////////////////////////////////
-ol_impl_result_t olEnqueueKernelLaunch_val(ol_queue_handle_t Queue,
-                                           ol_kernel_handle_t Kernel,
-                                           const size_t *GlobalWorkSize,
-                                           ol_event_handle_t *EventOut) {
+ol_impl_result_t
+olEnqueueKernelLaunch_val(ol_queue_handle_t Queue, ol_kernel_handle_t Kernel,
+                          const ol_kernel_launch_size_args_t *LaunchSizeArgs,
+                          ol_event_handle_t *EventOut) {
   if (true /*enableParameterValidation*/) {
     if (NULL == Queue) {
       return OL_ERRC_INVALID_NULL_HANDLE;
@@ -977,26 +977,27 @@ ol_impl_result_t olEnqueueKernelLaunch_val(ol_queue_handle_t Queue,
       return OL_ERRC_INVALID_NULL_HANDLE;
     }
 
-    if (NULL == GlobalWorkSize) {
+    if (NULL == LaunchSizeArgs) {
       return OL_ERRC_INVALID_NULL_POINTER;
     }
   }
 
-  return olEnqueueKernelLaunch_impl(Queue, Kernel, GlobalWorkSize, EventOut);
+  return olEnqueueKernelLaunch_impl(Queue, Kernel, LaunchSizeArgs, EventOut);
 }
-OL_APIEXPORT ol_result_t OL_APICALL olEnqueueKernelLaunch(
-    ol_queue_handle_t Queue, ol_kernel_handle_t Kernel,
-    const size_t *GlobalWorkSize, ol_event_handle_t *EventOut) {
+OL_APIEXPORT ol_result_t OL_APICALL
+olEnqueueKernelLaunch(ol_queue_handle_t Queue, ol_kernel_handle_t Kernel,
+                      const ol_kernel_launch_size_args_t *LaunchSizeArgs,
+                      ol_event_handle_t *EventOut) {
   if (offloadConfig().TracingEnabled) {
     std::cout << "---> olEnqueueKernelLaunch";
   }
 
   ol_result_t Result =
-      olEnqueueKernelLaunch_val(Queue, Kernel, GlobalWorkSize, EventOut);
+      olEnqueueKernelLaunch_val(Queue, Kernel, LaunchSizeArgs, EventOut);
 
   if (offloadConfig().TracingEnabled) {
     ol_enqueue_kernel_launch_params_t Params = {&Queue, &Kernel,
-                                                &GlobalWorkSize, &EventOut};
+                                                &LaunchSizeArgs, &EventOut};
     std::cout << "(" << &Params << ")";
     std::cout << "-> " << Result << "\n";
     if (Result && Result->Details) {
@@ -1005,14 +1006,13 @@ OL_APIEXPORT ol_result_t OL_APICALL olEnqueueKernelLaunch(
   }
   return Result;
 }
-ol_result_t olEnqueueKernelLaunchWithCodeLoc(ol_queue_handle_t Queue,
-                                             ol_kernel_handle_t Kernel,
-                                             const size_t *GlobalWorkSize,
-                                             ol_event_handle_t *EventOut,
-                                             ol_code_location_t *CodeLocation) {
+ol_result_t olEnqueueKernelLaunchWithCodeLoc(
+    ol_queue_handle_t Queue, ol_kernel_handle_t Kernel,
+    const ol_kernel_launch_size_args_t *LaunchSizeArgs,
+    ol_event_handle_t *EventOut, ol_code_location_t *CodeLocation) {
   currentCodeLocation() = CodeLocation;
   ol_result_t Result =
-      olEnqueueKernelLaunch(Queue, Kernel, GlobalWorkSize, EventOut);
+      olEnqueueKernelLaunch(Queue, Kernel, LaunchSizeArgs, EventOut);
 
   currentCodeLocation() = nullptr;
   return Result;
diff --git a/offload/liboffload/include/generated/OffloadImplFuncDecls.inc b/offload/liboffload/include/generated/OffloadImplFuncDecls.inc
index 9401b20f97c11..e7204e594973e 100644
--- a/offload/liboffload/include/generated/OffloadImplFuncDecls.inc
+++ b/offload/liboffload/include/generated/OffloadImplFuncDecls.inc
@@ -73,10 +73,10 @@ ol_impl_result_t olEnqueueDataCopy_impl(ol_queue_handle_t Queue, void *SrcPtr,
                                         size_t Size,
                                         ol_event_handle_t *EventOut);
 
-ol_impl_result_t olEnqueueKernelLaunch_impl(ol_queue_handle_t Queue,
-                                            ol_kernel_handle_t Kernel,
-                                            const size_t *GlobalWorkSize,
-                                            ol_event_handle_t *EventOut);
+ol_impl_result_t
+olEnqueueKernelLaunch_impl(ol_queue_handle_t Queue, ol_kernel_handle_t Kernel,
+                           const ol_kernel_launch_size_args_t *LaunchSizeArgs,
+                           ol_event_handle_t *EventOut);
 
 ol_impl_result_t olCreateProgram_impl(ol_device_handle_t Device, void *ProgData,
                                       size_t ProgDataSize,
diff --git a/offload/liboffload/include/generated/OffloadPrint.hpp b/offload/liboffload/include/generated/OffloadPrint.hpp
index a9656d4ee45d6..157bee0cd07a8 100644
--- a/offload/liboffload/include/generated/OffloadPrint.hpp
+++ b/offload/liboffload/include/generated/OffloadPrint.hpp
@@ -309,6 +309,57 @@ inline std::ostream &operator<<(std::ostream &os,
   }
   return os;
 }
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Print operator for the ol_code_location_t type
+/// @returns std::ostream &
+
+inline std::ostream &operator<<(std::ostream &os,
+                                const struct ol_code_location_t params) {
+  os << "(struct ol_code_location_t){";
+  os << ".FunctionName = ";
+  printPtr(os, params.FunctionName);
+  os << ", ";
+  os << ".SourceFile = ";
+  printPtr(os, params.SourceFile);
+  os << ", ";
+  os << ".LineNumber = ";
+  os << params.LineNumber;
+  os << ", ";
+  os << ".ColumnNumber = ";
+  os << params.ColumnNumber;
+  os << "}";
+  return os;
+}
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Print operator for the ol_kernel_launch_size_args_t type
+/// @returns std::ostream &
+
+inline std::ostream &
+operator<<(std::ostream &os, const struct ol_kernel_launch_size_args_t params) {
+  os << "(struct ol_kernel_launch_size_args_t){";
+  os << ".Dimensions = ";
+  os << params.Dimensions;
+  os << ", ";
+  os << ".NumGroupsX = ";
+  os << params.NumGroupsX;
+  os << ", ";
+  os << ".NumGroupsY = ";
+  os << params.NumGroupsY;
+  os << ", ";
+  os << ".NumGroupsZ = ";
+  os << params.NumGroupsZ;
+  os << ", ";
+  os << ".GroupSizeX = ";
+  os << params.GroupSizeX;
+  os << ", ";
+  os << ".GroupSizeY = ";
+  os << params.GroupSizeY;
+  os << ", ";
+  os << ".GroupSizeZ = ";
+  os << params.GroupSizeZ;
+  os << "}";
+  return os;
+}
 
 inline std::ostream &operator<<(std::ostream &os,
                                 const struct ol_get_platform_params_t *params) {
@@ -583,8 +634,8 @@ operator<<(std::ostream &os,
   os << ".Kernel = ";
   printPtr(os, *params->pKernel);
   os << ", ";
-  os << ".GlobalWorkSize = ";
-  printPtr(os, *params->pGlobalWorkSize);
+  os << ".LaunchSizeArgs = ";
+  printPtr(os, *params->pLaunchSizeArgs);
   os << ", ";
   os << ".EventOut = ";
   printPtr(os, *params->pEventOut);
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index 7d57c0696ad9e..a0057879b2bbe 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -560,22 +560,22 @@ ol_impl_result_t olSetKernelArgValue_impl(ol_kernel_handle_t Kernel,
   return OL_SUCCESS;
 }
 
-ol_impl_result_t olEnqueueKernelLaunch_impl(ol_queue_handle_t Queue,
-                                            ol_kernel_handle_t Kernel,
-                                            const size_t *GlobalWorkSize,
-                                            ol_event_handle_t *EventOut) {
+ol_impl_result_t
+olEnqueueKernelLaunch_impl(ol_queue_handle_t Queue, ol_kernel_handle_t Kernel,
+                           const ol_kernel_launch_size_args_t *LaunchSizeArgs,
+                           ol_event_handle_t *EventOut) {
   auto &DeviceImpl = Queue->Device->Device;
 
   AsyncInfoWrapperTy AsyncInfoWrapper(DeviceImpl, Queue->AsyncInfo);
 
   KernelArgsTy LaunchArgs{};
   LaunchArgs.NumArgs = Kernel->Args.getPointers().size();
-  LaunchArgs.NumTeams[0] = GlobalWorkSize[0];
-  LaunchArgs.NumTeams[1] = 1;
-  LaunchArgs.NumTeams[2] = 1;
-  LaunchArgs.ThreadLimit[0] = 1;
-  LaunchArgs.ThreadLimit[1] = 1;
-  LaunchArgs.ThreadLimit[2] = 1;
+  LaunchArgs.NumTeams[0] = LaunchSizeArgs->NumGroupsX;
+  LaunchArgs.NumTeams[1] = LaunchSizeArgs->NumGroupsY;
+  LaunchArgs.NumTeams[2] = LaunchSizeArgs->NumGroupsZ;
+  LaunchArgs.ThreadLimit[0] = LaunchSizeArgs->GroupSizeX;
+  LaunchArgs.ThreadLimit[1] = LaunchSizeArgs->GroupSizeY;
+  LaunchArgs.ThreadLimit[2] = LaunchSizeArgs->GroupSizeZ;
 
   LaunchArgs.ArgPtrs = (void **)Kernel->Args.getStorage();
 
diff --git a/offload/tools/offload-tblgen/PrintGen.cpp b/offload/tools/offload-tblgen/PrintGen.cpp
index 2a7c63c3dfd1f..d7a63b68451b0 100644
--- a/offload/tools/offload-tblgen/PrintGen.cpp
+++ b/offload/tools/offload-tblgen/PrintGen.cpp
@@ -20,7 +20,7 @@
 using namespace llvm;
 using namespace offload::tblgen;
 
-constexpr auto PrintEnumHeader =
+constexpr auto PrintTypeHeader =
     R"(///////////////////////////////////////////////////////////////////////////////
 /// @brief Print operator for the {0} type
 /// @returns std::ostream &
@@ -33,7 +33,7 @@ constexpr auto PrintTaggedEnumHeader =
 )";
 
 static void ProcessEnum(const EnumRec &Enum, raw_ostream &OS) {
-  OS << formatv(PrintEnumHeader, Enum.getName());
+  OS << formatv(PrintTypeHeader, Enum.getName());
   OS << formatv(
       "inline std::ostream &operator<<(std::ostream &os, enum {0} value) "
       "{{\n" TAB_1 "switch (value) {{\n",
@@ -150,6 +150,33 @@ inline std::ostream &operator<<(std::ostream &os, const struct {0} *params) {{
   OS << TAB_1 "return os;\n}\n";
 }
 
+
+void ProcessStruct(const StructRec &Struct, raw_ostream &OS) {
+  if (Struct.getName() == "ol_error_struct_t") {
+    return;
+  }
+  OS << formatv(PrintTypeHeader, Struct.getName());
+  OS << formatv(R"(
+inline std::ostream &operator<<(std::ostream &os, const struct {0} params) {{
+)",
+                Struct.getName());
+  OS << formatv(TAB_1 "os << \"(struct {0}){{\";\n", Struct.getName());
+  for (const auto &Member : Struct.getMembers()) {
+    OS << formatv(TAB_1 "os << \".{0} = \";\n", Member.getName());
+    if (Member.isPointerType() || Member.isHandleType()) {
+      OS << formatv(TAB_1 "printPtr(os, params.{0});\n", Member.getName());
+    } else {
+      OS << formatv(TAB_1 "os << params.{0};\n", Member.getName());
+    }
+    if (Member.getName() != Struct.getMembers().back().getName()) {
+      OS << TAB_1 "os << \", \";\n";
+    }
+  }
+  OS << TAB_1 "os << \"}\";\n";
+  OS << TAB_1 "return os;\n";
+  OS << "}\n";
+}
+
 void EmitOffloadPrintHeader(const RecordKeeper &Records, raw_ostream &OS) {
   OS << GenericHeader;
   OS << R"""(
@@ -193,6 +220,11 @@ template <typename T> inline void printTagged(std::ostream &os, const void *ptr,
   }
   EmitResultPrint(OS);
 
+  for (auto *R : Records.getAllDerivedDefinitions("Struct")) {
+    StructRec S{R};
+    ProcessStruct(S, OS);
+  }
+
   // Emit print functions for the function param structs
   for (auto *R : Records.getAllDerivedDefinitions("Function")) {
     EmitFunctionParamStructPrint(FunctionRec{R}, OS);
diff --git a/offload/tools/offload-tblgen/RecordTypes.hpp b/offload/tools/offload-tblgen/RecordTypes.hpp
index 0bf3256c525d9..9faf361f4dd76 100644
--- a/offload/tools/offload-tblgen/RecordTypes.hpp
+++ b/offload/tools/offload-tblgen/RecordTypes.hpp
@@ -103,6 +103,8 @@ class StructMemberRec {
   StringRef getType() const { return rec->getValueAsString("type"); }
   StringRef getName() const { return rec->getValueAsString("name"); }
   StringRef getDesc() const { return rec->getValueAsString("desc"); }
+  bool isPointerType() const { return getType().ends_with('*'); }
+  bool isHandleType() const { return getType().ends_with("_handle_t"); }
 
 private:
   const Record *rec;

>From 81bd64629136e444469c157ea79b2c2b248916f8 Mon Sep 17 00:00:00 2001
From: Callum Fare <callum at codeplay.com>
Date: Wed, 5 Feb 2025 12:14:49 +0000
Subject: [PATCH 08/17] Remove currently unused alignment param

---
 offload/liboffload/API/Memory.td                |  1 -
 .../liboffload/include/generated/OffloadAPI.h   |  5 +----
 .../include/generated/OffloadEntryPoints.inc    | 17 +++++++----------
 .../include/generated/OffloadImplFuncDecls.inc  |  2 +-
 .../include/generated/OffloadPrint.hpp          |  3 ---
 offload/liboffload/src/OffloadImpl.cpp          |  4 ++--
 .../OffloadAPI/enqueue/olEnqueueDataCopy.cpp    |  4 ++--
 .../OffloadAPI/enqueue/olEnqueueDataRead.cpp    |  2 +-
 .../OffloadAPI/enqueue/olEnqueueDataWrite.cpp   | 17 +++++++----------
 .../unittests/OffloadAPI/memory/olMemAlloc.cpp  | 16 ++++++++--------
 .../unittests/OffloadAPI/memory/olMemFree.cpp   | 14 +++++++-------
 11 files changed, 36 insertions(+), 49 deletions(-)

diff --git a/offload/liboffload/API/Memory.td b/offload/liboffload/API/Memory.td
index c15ae6f6d21ca..2c3f4c83980d0 100644
--- a/offload/liboffload/API/Memory.td
+++ b/offload/liboffload/API/Memory.td
@@ -27,7 +27,6 @@ def : Function {
     Param<"ol_device_handle_t", "Device", "handle of the device to allocate on", PARAM_IN>,
     Param<"ol_alloc_type_t", "Type", "type of the allocation", PARAM_IN>,
     Param<"size_t", "Size", "size of the allocation in bytes", PARAM_IN>,
-    Param<"size_t", "Aligment", "alignment of the allocation in bytes", PARAM_IN>,
     Param<"void**", "AllocationOut", "output for the allocated pointer", PARAM_OUT>
   ];
   let returns = [
diff --git a/offload/liboffload/include/generated/OffloadAPI.h b/offload/liboffload/include/generated/OffloadAPI.h
index 4f4ff51bef5d0..950c0e37ae67c 100644
--- a/offload/liboffload/include/generated/OffloadAPI.h
+++ b/offload/liboffload/include/generated/OffloadAPI.h
@@ -513,8 +513,6 @@ OL_APIEXPORT ol_result_t OL_APICALL olMemAlloc(
     ol_alloc_type_t Type,
     // [in] size of the allocation in bytes
     size_t Size,
-    // [in] alignment of the allocation in bytes
-    size_t Aligment,
     // [out] output for the allocated pointer
     void **AllocationOut);
 
@@ -1008,7 +1006,6 @@ typedef struct ol_mem_alloc_params_t {
   ol_device_handle_t *pDevice;
   ol_alloc_type_t *pType;
   size_t *pSize;
-  size_t *pAligment;
   void ***pAllocationOut;
 } ol_mem_alloc_params_t;
 
@@ -1261,7 +1258,7 @@ OL_APIEXPORT ol_result_t OL_APICALL olGetDeviceInfoSizeWithCodeLoc(
 /// @details See also ::olMemAlloc
 OL_APIEXPORT ol_result_t OL_APICALL olMemAllocWithCodeLoc(
     ol_device_handle_t Device, ol_alloc_type_t Type, size_t Size,
-    size_t Aligment, void **AllocationOut, ol_code_location_t *CodeLocation);
+    void **AllocationOut, ol_code_location_t *CodeLocation);
 
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Variant of olMemFree that also sets source code location information
diff --git a/offload/liboffload/include/generated/OffloadEntryPoints.inc b/offload/liboffload/include/generated/OffloadEntryPoints.inc
index 57cf4d64744cc..6d8f1d7c7171f 100644
--- a/offload/liboffload/include/generated/OffloadEntryPoints.inc
+++ b/offload/liboffload/include/generated/OffloadEntryPoints.inc
@@ -442,8 +442,7 @@ ol_result_t olGetDeviceInfoSizeWithCodeLoc(ol_device_handle_t Device,
 
 ///////////////////////////////////////////////////////////////////////////////
 ol_impl_result_t olMemAlloc_val(ol_device_handle_t Device, ol_alloc_type_t Type,
-                                size_t Size, size_t Aligment,
-                                void **AllocationOut) {
+                                size_t Size, void **AllocationOut) {
   if (true /*enableParameterValidation*/) {
     if (Size == 0) {
       return OL_ERRC_INVALID_SIZE;
@@ -458,22 +457,20 @@ ol_impl_result_t olMemAlloc_val(ol_device_handle_t Device, ol_alloc_type_t Type,
     }
   }
 
-  return olMemAlloc_impl(Device, Type, Size, Aligment, AllocationOut);
+  return olMemAlloc_impl(Device, Type, Size, AllocationOut);
 }
 OL_APIEXPORT ol_result_t OL_APICALL olMemAlloc(ol_device_handle_t Device,
                                                ol_alloc_type_t Type,
-                                               size_t Size, size_t Aligment,
+                                               size_t Size,
                                                void **AllocationOut) {
   if (offloadConfig().TracingEnabled) {
     std::cout << "---> olMemAlloc";
   }
 
-  ol_result_t Result =
-      olMemAlloc_val(Device, Type, Size, Aligment, AllocationOut);
+  ol_result_t Result = olMemAlloc_val(Device, Type, Size, AllocationOut);
 
   if (offloadConfig().TracingEnabled) {
-    ol_mem_alloc_params_t Params = {&Device, &Type, &Size, &Aligment,
-                                    &AllocationOut};
+    ol_mem_alloc_params_t Params = {&Device, &Type, &Size, &AllocationOut};
     std::cout << "(" << &Params << ")";
     std::cout << "-> " << Result << "\n";
     if (Result && Result->Details) {
@@ -484,10 +481,10 @@ OL_APIEXPORT ol_result_t OL_APICALL olMemAlloc(ol_device_handle_t Device,
 }
 ol_result_t olMemAllocWithCodeLoc(ol_device_handle_t Device,
                                   ol_alloc_type_t Type, size_t Size,
-                                  size_t Aligment, void **AllocationOut,
+                                  void **AllocationOut,
                                   ol_code_location_t *CodeLocation) {
   currentCodeLocation() = CodeLocation;
-  ol_result_t Result = olMemAlloc(Device, Type, Size, Aligment, AllocationOut);
+  ol_result_t Result = olMemAlloc(Device, Type, Size, AllocationOut);
 
   currentCodeLocation() = nullptr;
   return Result;
diff --git a/offload/liboffload/include/generated/OffloadImplFuncDecls.inc b/offload/liboffload/include/generated/OffloadImplFuncDecls.inc
index e7204e594973e..e7179e44fc9ec 100644
--- a/offload/liboffload/include/generated/OffloadImplFuncDecls.inc
+++ b/offload/liboffload/include/generated/OffloadImplFuncDecls.inc
@@ -39,7 +39,7 @@ ol_impl_result_t olGetDeviceInfoSize_impl(ol_device_handle_t Device,
 
 ol_impl_result_t olMemAlloc_impl(ol_device_handle_t Device,
                                  ol_alloc_type_t Type, size_t Size,
-                                 size_t Aligment, void **AllocationOut);
+                                 void **AllocationOut);
 
 ol_impl_result_t olMemFree_impl(ol_device_handle_t Device, ol_alloc_type_t Type,
                                 void *Address);
diff --git a/offload/liboffload/include/generated/OffloadPrint.hpp b/offload/liboffload/include/generated/OffloadPrint.hpp
index 157bee0cd07a8..5271832451dd6 100644
--- a/offload/liboffload/include/generated/OffloadPrint.hpp
+++ b/offload/liboffload/include/generated/OffloadPrint.hpp
@@ -489,9 +489,6 @@ inline std::ostream &operator<<(std::ostream &os,
   os << ".Size = ";
   os << *params->pSize;
   os << ", ";
-  os << ".Aligment = ";
-  os << *params->pAligment;
-  os << ", ";
   os << ".AllocationOut = ";
   printPtr(os, *params->pAllocationOut);
   return os;
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index a0057879b2bbe..a2d3730d3b303 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -322,7 +322,7 @@ TargetAllocTy convertOlToPluginAllocTy(ol_alloc_type_t Type) {
 }
 
 ol_impl_result_t olMemAlloc_impl(ol_device_handle_t Device,
-                                 ol_alloc_type_t Type, size_t Size, size_t,
+                                 ol_alloc_type_t Type, size_t Size,
                                  void **AllocationOut) {
   auto Alloc =
       Device->Device.dataAlloc(Size, nullptr, convertOlToPluginAllocTy(Type));
@@ -485,7 +485,7 @@ ol_impl_result_t olCreateProgram_impl(ol_device_handle_t Device, void *ProgData,
                                       ol_program_handle_t *Program) {
   auto ImageData = MemoryBuffer::getMemBufferCopy(
       StringRef(reinterpret_cast<char *>(ProgData), ProgDataSize));
-  __tgt_device_image DeviceImage{(char *) ImageData->getBuffer().data(),
+  __tgt_device_image DeviceImage{(char *)ImageData->getBuffer().data(),
                                  ((char *)ImageData->getBuffer().data()) +
                                      ProgDataSize - 1,
                                  nullptr, nullptr};
diff --git a/offload/unittests/OffloadAPI/enqueue/olEnqueueDataCopy.cpp b/offload/unittests/OffloadAPI/enqueue/olEnqueueDataCopy.cpp
index afc5866821e36..d15e738bc94e6 100644
--- a/offload/unittests/OffloadAPI/enqueue/olEnqueueDataCopy.cpp
+++ b/offload/unittests/OffloadAPI/enqueue/olEnqueueDataCopy.cpp
@@ -19,8 +19,8 @@ TEST_F(olEnqueueDataCopyTest, Success) {
   std::vector<uint8_t> Input(Size, 42);
   std::vector<uint8_t> Output(Size, 0);
 
-  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, Size, 0, &AllocA));
-  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, Size, 0, &AllocB));
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, Size, &AllocA));
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, Size, &AllocB));
   ASSERT_SUCCESS(
       olEnqueueDataWrite(Queue, Input.data(), AllocA, Size, nullptr));
   ASSERT_SUCCESS(
diff --git a/offload/unittests/OffloadAPI/enqueue/olEnqueueDataRead.cpp b/offload/unittests/OffloadAPI/enqueue/olEnqueueDataRead.cpp
index 76d3490cc8737..5787889c4febb 100644
--- a/offload/unittests/OffloadAPI/enqueue/olEnqueueDataRead.cpp
+++ b/offload/unittests/OffloadAPI/enqueue/olEnqueueDataRead.cpp
@@ -18,7 +18,7 @@ TEST_F(olEnqueueDataReadTest, Success) {
   std::vector<uint8_t> Input(Size, 42);
   std::vector<uint8_t> Output(Size, 0);
 
-  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, Size, 0, &Alloc));
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, Size, &Alloc));
   ASSERT_SUCCESS(olEnqueueDataWrite(Queue, Input.data(), Alloc, Size, nullptr));
   ASSERT_SUCCESS(olEnqueueDataRead(Queue, Alloc, Output.data(), Size, nullptr));
   ASSERT_SUCCESS(olFinishQueue(Queue));
diff --git a/offload/unittests/OffloadAPI/enqueue/olEnqueueDataWrite.cpp b/offload/unittests/OffloadAPI/enqueue/olEnqueueDataWrite.cpp
index ad66887643d56..d3f3edf58a531 100644
--- a/offload/unittests/OffloadAPI/enqueue/olEnqueueDataWrite.cpp
+++ b/offload/unittests/OffloadAPI/enqueue/olEnqueueDataWrite.cpp
@@ -13,14 +13,11 @@
 using olEnqueueDataWriteTest = offloadQueueTest;
 
 TEST_F(olEnqueueDataWriteTest, Success) {
-    constexpr size_t Size = 1024;
-    void *Alloc;
-    ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, Size, 0, &Alloc));
-    std::vector<uint8_t> Input(Size, 42);
-    ASSERT_SUCCESS(
-        olEnqueueDataWrite(Queue, Input.data(), Alloc, Size, nullptr));
-    olFinishQueue(Queue);
-    olMemFree(Device, OL_ALLOC_TYPE_DEVICE, Alloc);
+  constexpr size_t Size = 1024;
+  void *Alloc;
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, Size, &Alloc));
+  std::vector<uint8_t> Input(Size, 42);
+  ASSERT_SUCCESS(olEnqueueDataWrite(Queue, Input.data(), Alloc, Size, nullptr));
+  olFinishQueue(Queue);
+  olMemFree(Device, OL_ALLOC_TYPE_DEVICE, Alloc);
 }
-
-
diff --git a/offload/unittests/OffloadAPI/memory/olMemAlloc.cpp b/offload/unittests/OffloadAPI/memory/olMemAlloc.cpp
index e951231d4a0e9..47388801b2e58 100644
--- a/offload/unittests/OffloadAPI/memory/olMemAlloc.cpp
+++ b/offload/unittests/OffloadAPI/memory/olMemAlloc.cpp
@@ -14,21 +14,21 @@ using olMemAllocTest = offloadDeviceTest;
 
 TEST_F(olMemAllocTest, SuccessAllocShared) {
   void *Alloc = nullptr;
-  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_SHARED, 1024, 0, &Alloc));
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_SHARED, 1024, &Alloc));
   ASSERT_NE(Alloc, nullptr);
   olMemFree(Device, OL_ALLOC_TYPE_SHARED, Alloc);
 }
 
 TEST_F(olMemAllocTest, SuccessAllocHost) {
-    void *Alloc = nullptr;
-    ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_HOST, 1024, 0, &Alloc));
-    ASSERT_NE(Alloc, nullptr);
-    olMemFree(Device, OL_ALLOC_TYPE_HOST, Alloc);
+  void *Alloc = nullptr;
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_HOST, 1024, &Alloc));
+  ASSERT_NE(Alloc, nullptr);
+  olMemFree(Device, OL_ALLOC_TYPE_HOST, Alloc);
 }
 
 TEST_F(olMemAllocTest, SuccessAllocDevice) {
   void *Alloc = nullptr;
-  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, 1024, 0, &Alloc));
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, 1024, &Alloc));
   ASSERT_NE(Alloc, nullptr);
   olMemFree(Device, OL_ALLOC_TYPE_DEVICE, Alloc);
 }
@@ -36,10 +36,10 @@ TEST_F(olMemAllocTest, SuccessAllocDevice) {
 TEST_F(olMemAllocTest, InvalidNullDevice) {
   void *Alloc = nullptr;
   ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE,
-               olMemAlloc(nullptr, OL_ALLOC_TYPE_DEVICE, 1024, 0, &Alloc));
+               olMemAlloc(nullptr, OL_ALLOC_TYPE_DEVICE, 1024, &Alloc));
 }
 
 TEST_F(olMemAllocTest, InvalidNullOutPtr) {
   ASSERT_ERROR(OL_ERRC_INVALID_NULL_POINTER,
-               olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, 1024, 0, nullptr));
+               olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, 1024, nullptr));
 }
diff --git a/offload/unittests/OffloadAPI/memory/olMemFree.cpp b/offload/unittests/OffloadAPI/memory/olMemFree.cpp
index 54e8a24f9fbba..647c81a4e9536 100644
--- a/offload/unittests/OffloadAPI/memory/olMemFree.cpp
+++ b/offload/unittests/OffloadAPI/memory/olMemFree.cpp
@@ -14,25 +14,25 @@ using olMemFreeTest = offloadDeviceTest;
 
 TEST_F(olMemFreeTest, SuccessFreeShared) {
   void *Alloc = nullptr;
-  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_SHARED, 1024, 0, &Alloc));
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_SHARED, 1024, &Alloc));
   ASSERT_SUCCESS(olMemFree(Device, OL_ALLOC_TYPE_SHARED, Alloc));
 }
 
 TEST_F(olMemFreeTest, SuccessFreeHost) {
-    void *Alloc = nullptr;
-    ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_HOST, 1024, 0, &Alloc));
-    ASSERT_SUCCESS(olMemFree(Device, OL_ALLOC_TYPE_HOST, Alloc));
+  void *Alloc = nullptr;
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_HOST, 1024, &Alloc));
+  ASSERT_SUCCESS(olMemFree(Device, OL_ALLOC_TYPE_HOST, Alloc));
 }
 
 TEST_F(olMemFreeTest, SuccessFreeDevice) {
   void *Alloc = nullptr;
-  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, 1024, 0, &Alloc));
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, 1024, &Alloc));
   ASSERT_SUCCESS(olMemFree(Device, OL_ALLOC_TYPE_DEVICE, Alloc));
 }
 
 TEST_F(olMemFreeTest, InvalidNullDevice) {
   void *Alloc = nullptr;
-  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, 1024, 0, &Alloc));
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, 1024, &Alloc));
   ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE,
                olMemFree(nullptr, OL_ALLOC_TYPE_DEVICE, &Alloc));
   ASSERT_SUCCESS(olMemFree(Device, OL_ALLOC_TYPE_DEVICE, Alloc));
@@ -40,7 +40,7 @@ TEST_F(olMemFreeTest, InvalidNullDevice) {
 
 TEST_F(olMemFreeTest, InvalidNullPtr) {
   void *Alloc = nullptr;
-  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, 1024, 0, &Alloc));
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, 1024, &Alloc));
   ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE,
                olMemFree(nullptr, OL_ALLOC_TYPE_DEVICE, &Alloc));
   ASSERT_SUCCESS(olMemFree(Device, OL_ALLOC_TYPE_DEVICE, Alloc));

>From 85391843fec4de6918022623f1a48c784f0f1106 Mon Sep 17 00:00:00 2001
From: Callum Fare <callum at codeplay.com>
Date: Wed, 5 Feb 2025 12:20:40 +0000
Subject: [PATCH 09/17] Fix formatting

---
 offload/tools/offload-tblgen/PrintGen.cpp            | 1 -
 offload/unittests/OffloadAPI/queue/olRetainQueue.cpp | 4 +---
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/offload/tools/offload-tblgen/PrintGen.cpp b/offload/tools/offload-tblgen/PrintGen.cpp
index d7a63b68451b0..43a9c8478e1ff 100644
--- a/offload/tools/offload-tblgen/PrintGen.cpp
+++ b/offload/tools/offload-tblgen/PrintGen.cpp
@@ -150,7 +150,6 @@ inline std::ostream &operator<<(std::ostream &os, const struct {0} *params) {{
   OS << TAB_1 "return os;\n}\n";
 }
 
-
 void ProcessStruct(const StructRec &Struct, raw_ostream &OS) {
   if (Struct.getName() == "ol_error_struct_t") {
     return;
diff --git a/offload/unittests/OffloadAPI/queue/olRetainQueue.cpp b/offload/unittests/OffloadAPI/queue/olRetainQueue.cpp
index 9e499d849c742..eec921ffba5ef 100644
--- a/offload/unittests/OffloadAPI/queue/olRetainQueue.cpp
+++ b/offload/unittests/OffloadAPI/queue/olRetainQueue.cpp
@@ -15,6 +15,4 @@ using olRetainQueueTest = offloadQueueTest;
 // TODO: When we can fetch queue info we can check the reference count is
 // changing in the expected way. In the meantime just check the entry point
 // doesn't blow up.
-TEST_F(olRetainQueueTest, Success) {
-  ASSERT_SUCCESS(olRetainQueue(Queue));
-}
+TEST_F(olRetainQueueTest, Success) { ASSERT_SUCCESS(olRetainQueue(Queue)); }

>From 20acc170d6b26ff739dbc17c4d80531dba414ab4 Mon Sep 17 00:00:00 2001
From: Callum Fare <callum at codeplay.com>
Date: Wed, 5 Feb 2025 16:00:38 +0000
Subject: [PATCH 10/17] Fix leak in olReleaseQueue

---
 offload/liboffload/src/OffloadImpl.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index a2d3730d3b303..7dcb5d935d535 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -364,7 +364,9 @@ ol_impl_result_t olRetainQueue_impl(ol_queue_handle_t Queue) {
 }
 
 ol_impl_result_t olReleaseQueue_impl(ol_queue_handle_t Queue) {
-  Queue->RefCount--;
+  if (--Queue->RefCount == 0) {
+    delete Queue;
+  }
   return OL_SUCCESS;
 }
 

>From 3423f701beef7ea0823d99cd1468705a5f5f2c19 Mon Sep 17 00:00:00 2001
From: Callum Fare <callum at codeplay.com>
Date: Thu, 6 Feb 2025 16:49:19 +0000
Subject: [PATCH 11/17] General tidy up; improve documentation and formatting

---
 offload/liboffload/API/Common.td              |   7 +-
 offload/liboffload/API/Enqueue.td             |   5 +-
 offload/liboffload/API/Event.td               |   4 +-
 offload/liboffload/API/Kernel.td              |  22 ++-
 offload/liboffload/API/Program.td             |  10 +-
 offload/liboffload/API/Queue.td               |   8 +-
 .../liboffload/include/generated/OffloadAPI.h |  63 +++++----
 .../include/generated/OffloadEntryPoints.inc  |   4 +
 .../include/generated/OffloadPrint.hpp        |  17 +--
 offload/liboffload/src/OffloadImpl.cpp        | 125 ++++++++----------
 10 files changed, 142 insertions(+), 123 deletions(-)

diff --git a/offload/liboffload/API/Common.td b/offload/liboffload/API/Common.td
index 7fedb2002f157..a0a2697e27e77 100644
--- a/offload/liboffload/API/Common.td
+++ b/offload/liboffload/API/Common.td
@@ -89,12 +89,11 @@ def : Enum {
     Etor<"SUCCESS", "Success">,
     Etor<"INVALID_VALUE", "Invalid Value">,
     Etor<"INVALID_PLATFORM", "Invalid platform">,
-    Etor<"DEVICE_NOT_FOUND", "Device not found">,
     Etor<"INVALID_DEVICE", "Invalid device">,
-    Etor<"DEVICE_LOST", "Device hung, reset, was removed, or driver update occurred">,
-    Etor<"UNINITIALIZED", "plugin is not initialized or specific entry-point is not implemented">,
+    Etor<"INVALID_QUEUE", "Invalid queue">,
+    Etor<"INVALID_EVENT", "Invalid event">,
+    Etor<"INVALID_KERNEL_NAME", "Named kernel not found in the program binary">,
     Etor<"OUT_OF_RESOURCES", "Out of resources">,
-    Etor<"UNSUPPORTED_VERSION", "generic error code for unsupported versions">,
     Etor<"UNSUPPORTED_FEATURE", "generic error code for unsupported features">,
     Etor<"INVALID_ARGUMENT", "generic error code for invalid arguments">,
     Etor<"INVALID_NULL_HANDLE", "handle argument is not valid">,
diff --git a/offload/liboffload/API/Enqueue.td b/offload/liboffload/API/Enqueue.td
index d9215e8175ef8..f503bf3c44139 100644
--- a/offload/liboffload/API/Enqueue.td
+++ b/offload/liboffload/API/Enqueue.td
@@ -21,7 +21,9 @@ def : Function {
         Param<"size_t", "Size", "size in bytes of data to copy", PARAM_IN>,
         Param<"ol_event_handle_t*", "EventOut", "optional recorded event for the enqueued operation", PARAM_OUT_OPTIONAL>
     ];
-    let returns = [];
+    let returns = [
+        Return<"OL_ERRC_INVALID_SIZE", ["`Size == 0`"]>
+    ];
 }
 
 def : Function {
@@ -53,7 +55,6 @@ def : Function {
     let returns = [];
 }
 
-
 def : Struct {
     let name = "ol_kernel_launch_size_args_t";
     let desc = "Size-related arguments for a kernel launch.";
diff --git a/offload/liboffload/API/Event.td b/offload/liboffload/API/Event.td
index db90a7c8e2be4..836a4755f3c87 100644
--- a/offload/liboffload/API/Event.td
+++ b/offload/liboffload/API/Event.td
@@ -12,7 +12,7 @@
 
 def : Function {
     let name = "olRetainEvent";
-    let desc = "Increment the reference count of the given event";
+    let desc = "Increment the event's reference count";
     let details = [];
     let params = [
         Param<"ol_event_handle_t", "Event", "handle of the event", PARAM_IN>
@@ -22,7 +22,7 @@ def : Function {
 
 def : Function {
     let name = "olReleaseEvent";
-    let desc = "Decrement the reference count of the given event";
+    let desc = "Decrement the event's reference count, and free it if the reference count reaches 0";
     let details = [];
     let params = [
         Param<"ol_event_handle_t", "Event", "handle of the event", PARAM_IN>
diff --git a/offload/liboffload/API/Kernel.td b/offload/liboffload/API/Kernel.td
index 4c8c84e9c71de..cad738c56b3a3 100644
--- a/offload/liboffload/API/Kernel.td
+++ b/offload/liboffload/API/Kernel.td
@@ -1,7 +1,21 @@
+//===-- Kernel.td - Kernel definitions for Offload ---------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains Offload API definitions related to the kernel handle
+//
+//===----------------------------------------------------------------------===//
+
 def : Function {
     let name = "olCreateKernel";
-    let desc = "";
-    let details = [];
+    let desc = "Create a kernel from the function identified by `KernelName` in the given program";
+    let details = [
+        "The created kernel has an initial reference count of 1."
+    ];
     let params = [
         Param<"ol_program_handle_t", "Program", "handle of the program", PARAM_IN>,
         Param<"const char*", "KernelName", "name of the kernel entry point in the program", PARAM_IN>,
@@ -12,7 +26,7 @@ def : Function {
 
 def : Function {
     let name = "olRetainKernel";
-    let desc = "Increment the reference count of the given kernel";
+    let desc = "Increment the kernel's reference count";
     let details = [];
     let params = [
         Param<"ol_kernel_handle_t", "Kernel", "handle of the kernel", PARAM_IN>
@@ -22,7 +36,7 @@ def : Function {
 
 def : Function {
     let name = "olReleaseKernel";
-    let desc = "Decrement the reference count of the given kernel";
+    let desc = "Decrement the kernel's reference count, and free it if the reference count reaches 0";
     let details = [];
     let params = [
         Param<"ol_kernel_handle_t", "Kernel", "handle of the kernel", PARAM_IN>
diff --git a/offload/liboffload/API/Program.td b/offload/liboffload/API/Program.td
index 684a6581320f8..d928f3d0cc2f1 100644
--- a/offload/liboffload/API/Program.td
+++ b/offload/liboffload/API/Program.td
@@ -12,8 +12,10 @@
 
 def : Function {
     let name = "olCreateProgram";
-    let desc = "";
-    let details = [];
+    let desc = "Create a program for the device from the binary image pointed to by `ProgData`";
+    let details = [
+        "The created program has an initial reference count of 1."
+    ];
     let params = [
         Param<"ol_device_handle_t", "Device", "handle of the device", PARAM_IN>,
         Param<"void*", "ProgData", "pointer to the program binary data", PARAM_IN>,
@@ -25,7 +27,7 @@ def : Function {
 
 def : Function {
     let name = "olRetainProgram";
-    let desc = "Create a queue for the given device";
+    let desc = "Increment the program's reference count";
     let details = [];
     let params = [
         Param<"ol_program_handle_t", "Program", "handle of the program", PARAM_IN>
@@ -35,7 +37,7 @@ def : Function {
 
 def : Function {
     let name = "olReleaseProgram";
-    let desc = "Create a queue for the given device";
+    let desc = "Decrement the program's reference count, and free it if the reference count reaches 0";
     let details = [];
     let params = [
         Param<"ol_program_handle_t", "Program", "handle of the program", PARAM_IN>
diff --git a/offload/liboffload/API/Queue.td b/offload/liboffload/API/Queue.td
index 5629fa40d56d5..786840a8e2141 100644
--- a/offload/liboffload/API/Queue.td
+++ b/offload/liboffload/API/Queue.td
@@ -13,7 +13,9 @@
 def : Function {
     let name = "olCreateQueue";
     let desc = "Create a queue for the given device";
-    let details = [];
+    let details = [
+        "The created queue has an initial reference count of 1."
+    ];
     let params = [
         Param<"ol_device_handle_t", "Device", "handle of the device", PARAM_IN>,
         Param<"ol_queue_handle_t*", "Queue", "output pointer for the created queue", PARAM_OUT>
@@ -23,7 +25,7 @@ def : Function {
 
 def : Function {
     let name = "olRetainQueue";
-    let desc = "Create a queue for the given device";
+    let desc = "Increment the queue's reference count.";
     let details = [];
     let params = [
         Param<"ol_queue_handle_t", "Queue", "handle of the queue", PARAM_IN>
@@ -33,7 +35,7 @@ def : Function {
 
 def : Function {
     let name = "olReleaseQueue";
-    let desc = "Create a queue for the given device";
+    let desc = "Decrement the queue's reference count, and free it if the reference count reaches 0";
     let details = [];
     let params = [
         Param<"ol_queue_handle_t", "Queue", "handle of the queue", PARAM_IN>
diff --git a/offload/liboffload/include/generated/OffloadAPI.h b/offload/liboffload/include/generated/OffloadAPI.h
index 950c0e37ae67c..f8683af811ef7 100644
--- a/offload/liboffload/include/generated/OffloadAPI.h
+++ b/offload/liboffload/include/generated/OffloadAPI.h
@@ -110,34 +110,32 @@ typedef enum ol_errc_t {
   OL_ERRC_INVALID_VALUE = 1,
   /// Invalid platform
   OL_ERRC_INVALID_PLATFORM = 2,
-  /// Device not found
-  OL_ERRC_DEVICE_NOT_FOUND = 3,
   /// Invalid device
-  OL_ERRC_INVALID_DEVICE = 4,
-  /// Device hung, reset, was removed, or driver update occurred
-  OL_ERRC_DEVICE_LOST = 5,
-  /// plugin is not initialized or specific entry-point is not implemented
-  OL_ERRC_UNINITIALIZED = 6,
+  OL_ERRC_INVALID_DEVICE = 3,
+  /// Invalid queue
+  OL_ERRC_INVALID_QUEUE = 4,
+  /// Invalid event
+  OL_ERRC_INVALID_EVENT = 5,
+  /// Named kernel not found in the program binary
+  OL_ERRC_INVALID_KERNEL_NAME = 6,
   /// Out of resources
   OL_ERRC_OUT_OF_RESOURCES = 7,
-  /// generic error code for unsupported versions
-  OL_ERRC_UNSUPPORTED_VERSION = 8,
   /// generic error code for unsupported features
-  OL_ERRC_UNSUPPORTED_FEATURE = 9,
+  OL_ERRC_UNSUPPORTED_FEATURE = 8,
   /// generic error code for invalid arguments
-  OL_ERRC_INVALID_ARGUMENT = 10,
+  OL_ERRC_INVALID_ARGUMENT = 9,
   /// handle argument is not valid
-  OL_ERRC_INVALID_NULL_HANDLE = 11,
+  OL_ERRC_INVALID_NULL_HANDLE = 10,
   /// pointer argument may not be nullptr
-  OL_ERRC_INVALID_NULL_POINTER = 12,
+  OL_ERRC_INVALID_NULL_POINTER = 11,
   /// invalid size or dimensions (e.g., must not be zero, or is out of bounds)
-  OL_ERRC_INVALID_SIZE = 13,
+  OL_ERRC_INVALID_SIZE = 12,
   /// enumerator argument is not valid
-  OL_ERRC_INVALID_ENUMERATION = 14,
+  OL_ERRC_INVALID_ENUMERATION = 13,
   /// enumerator argument is not supported by the device
-  OL_ERRC_UNSUPPORTED_ENUMERATION = 15,
+  OL_ERRC_UNSUPPORTED_ENUMERATION = 14,
   /// Unknown or internal error
-  OL_ERRC_UNKNOWN = 16,
+  OL_ERRC_UNKNOWN = 15,
   /// @cond
   OL_ERRC_FORCE_UINT32 = 0x7fffffff
   /// @endcond
@@ -541,6 +539,7 @@ OL_APIEXPORT ol_result_t OL_APICALL olMemFree(
 /// @brief Create a queue for the given device
 ///
 /// @details
+///    - The created queue has an initial reference count of 1.
 ///
 /// @returns
 ///     - ::OL_RESULT_SUCCESS
@@ -557,7 +556,7 @@ OL_APIEXPORT ol_result_t OL_APICALL olCreateQueue(
     ol_queue_handle_t *Queue);
 
 ///////////////////////////////////////////////////////////////////////////////
-/// @brief Create a queue for the given device
+/// @brief Increment the queue's reference count.
 ///
 /// @details
 ///
@@ -573,7 +572,8 @@ OL_APIEXPORT ol_result_t OL_APICALL olRetainQueue(
     ol_queue_handle_t Queue);
 
 ///////////////////////////////////////////////////////////////////////////////
-/// @brief Create a queue for the given device
+/// @brief Decrement the queue's reference count, and free it if the reference
+/// count reaches 0
 ///
 /// @details
 ///
@@ -605,7 +605,7 @@ OL_APIEXPORT ol_result_t OL_APICALL olFinishQueue(
     ol_queue_handle_t Queue);
 
 ///////////////////////////////////////////////////////////////////////////////
-/// @brief Increment the reference count of the given event
+/// @brief Increment the event's reference count
 ///
 /// @details
 ///
@@ -621,7 +621,8 @@ OL_APIEXPORT ol_result_t OL_APICALL olRetainEvent(
     ol_event_handle_t Event);
 
 ///////////////////////////////////////////////////////////////////////////////
-/// @brief Decrement the reference count of the given event
+/// @brief Decrement the event's reference count, and free it if the reference
+/// count reaches 0
 ///
 /// @details
 ///
@@ -661,6 +662,8 @@ OL_APIEXPORT ol_result_t OL_APICALL olWaitEvent(
 ///     - ::OL_RESULT_SUCCESS
 ///     - ::OL_ERRC_UNINITIALIZED
 ///     - ::OL_ERRC_DEVICE_LOST
+///     - ::OL_ERRC_INVALID_SIZE
+///         + `Size == 0`
 ///     - ::OL_ERRC_INVALID_NULL_HANDLE
 ///         + `NULL == Queue`
 ///     - ::OL_ERRC_INVALID_NULL_POINTER
@@ -770,9 +773,11 @@ OL_APIEXPORT ol_result_t OL_APICALL olEnqueueKernelLaunch(
     ol_event_handle_t *EventOut);
 
 ///////////////////////////////////////////////////////////////////////////////
-/// @brief
+/// @brief Create a program for the device from the binary image pointed to by
+/// `ProgData`
 ///
 /// @details
+///    - The created program has an initial reference count of 1.
 ///
 /// @returns
 ///     - ::OL_RESULT_SUCCESS
@@ -794,7 +799,7 @@ OL_APIEXPORT ol_result_t OL_APICALL olCreateProgram(
     ol_program_handle_t *Queue);
 
 ///////////////////////////////////////////////////////////////////////////////
-/// @brief Create a queue for the given device
+/// @brief Increment the program's reference count
 ///
 /// @details
 ///
@@ -810,7 +815,8 @@ OL_APIEXPORT ol_result_t OL_APICALL olRetainProgram(
     ol_program_handle_t Program);
 
 ///////////////////////////////////////////////////////////////////////////////
-/// @brief Create a queue for the given device
+/// @brief Decrement the program's reference count, and free it if the reference
+/// count reaches 0
 ///
 /// @details
 ///
@@ -826,9 +832,11 @@ OL_APIEXPORT ol_result_t OL_APICALL olReleaseProgram(
     ol_program_handle_t Program);
 
 ///////////////////////////////////////////////////////////////////////////////
-/// @brief
+/// @brief Create a kernel from the function identified by `KernelName` in the
+/// given program
 ///
 /// @details
+///    - The created kernel has an initial reference count of 1.
 ///
 /// @returns
 ///     - ::OL_RESULT_SUCCESS
@@ -848,7 +856,7 @@ OL_APIEXPORT ol_result_t OL_APICALL olCreateKernel(
     ol_kernel_handle_t *Kernel);
 
 ///////////////////////////////////////////////////////////////////////////////
-/// @brief Increment the reference count of the given kernel
+/// @brief Increment the kernel's reference count
 ///
 /// @details
 ///
@@ -864,7 +872,8 @@ OL_APIEXPORT ol_result_t OL_APICALL olRetainKernel(
     ol_kernel_handle_t Kernel);
 
 ///////////////////////////////////////////////////////////////////////////////
-/// @brief Decrement the reference count of the given kernel
+/// @brief Decrement the kernel's reference count, and free it if the reference
+/// count reaches 0
 ///
 /// @details
 ///
diff --git a/offload/liboffload/include/generated/OffloadEntryPoints.inc b/offload/liboffload/include/generated/OffloadEntryPoints.inc
index 6d8f1d7c7171f..ffef36b5a9ac4 100644
--- a/offload/liboffload/include/generated/OffloadEntryPoints.inc
+++ b/offload/liboffload/include/generated/OffloadEntryPoints.inc
@@ -798,6 +798,10 @@ ol_impl_result_t olEnqueueDataWrite_val(ol_queue_handle_t Queue, void *SrcPtr,
                                         void *DstPtr, size_t Size,
                                         ol_event_handle_t *EventOut) {
   if (true /*enableParameterValidation*/) {
+    if (Size == 0) {
+      return OL_ERRC_INVALID_SIZE;
+    }
+
     if (NULL == Queue) {
       return OL_ERRC_INVALID_NULL_HANDLE;
     }
diff --git a/offload/liboffload/include/generated/OffloadPrint.hpp b/offload/liboffload/include/generated/OffloadPrint.hpp
index 5271832451dd6..56ab655a4ae74 100644
--- a/offload/liboffload/include/generated/OffloadPrint.hpp
+++ b/offload/liboffload/include/generated/OffloadPrint.hpp
@@ -51,24 +51,21 @@ inline std::ostream &operator<<(std::ostream &os, enum ol_errc_t value) {
   case OL_ERRC_INVALID_PLATFORM:
     os << "OL_ERRC_INVALID_PLATFORM";
     break;
-  case OL_ERRC_DEVICE_NOT_FOUND:
-    os << "OL_ERRC_DEVICE_NOT_FOUND";
-    break;
   case OL_ERRC_INVALID_DEVICE:
     os << "OL_ERRC_INVALID_DEVICE";
     break;
-  case OL_ERRC_DEVICE_LOST:
-    os << "OL_ERRC_DEVICE_LOST";
+  case OL_ERRC_INVALID_QUEUE:
+    os << "OL_ERRC_INVALID_QUEUE";
+    break;
+  case OL_ERRC_INVALID_EVENT:
+    os << "OL_ERRC_INVALID_EVENT";
     break;
-  case OL_ERRC_UNINITIALIZED:
-    os << "OL_ERRC_UNINITIALIZED";
+  case OL_ERRC_INVALID_KERNEL_NAME:
+    os << "OL_ERRC_INVALID_KERNEL_NAME";
     break;
   case OL_ERRC_OUT_OF_RESOURCES:
     os << "OL_ERRC_OUT_OF_RESOURCES";
     break;
-  case OL_ERRC_UNSUPPORTED_VERSION:
-    os << "OL_ERRC_UNSUPPORTED_VERSION";
-    break;
   case OL_ERRC_UNSUPPORTED_FEATURE:
     os << "OL_ERRC_UNSUPPORTED_FEATURE";
     break;
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index 7dcb5d935d535..8be4c76030b36 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -53,7 +53,9 @@ struct ol_program_handle_t_ {
   std::atomic_uint32_t RefCount;
 };
 
-struct OffloadArguments {
+// A helper that can be used to construct the argument buffer for a kernel.
+// Alternatively, a pre-existing buffer can be set with `setArgsData`.
+struct OffloadKernelArguments {
   static constexpr size_t MaxParamBytes = 4096u;
   using args_t = std::array<char, MaxParamBytes>;
   using args_size_t = std::vector<size_t>;
@@ -94,7 +96,7 @@ struct ol_kernel_handle_t_ {
   ol_program_handle_t Program;
   std::atomic_uint32_t RefCount;
   GenericKernelTy *KernelImpl;
-  OffloadArguments Args;
+  OffloadKernelArguments Args;
 };
 
 using PlatformVecT = SmallVector<ol_platform_handle_t_, 4>;
@@ -238,9 +240,8 @@ ol_impl_result_t olGetDeviceCount_impl(ol_platform_handle_t Platform,
 ol_impl_result_t olGetDevice_impl(ol_platform_handle_t Platform,
                                   uint32_t NumEntries,
                                   ol_device_handle_t *Devices) {
-  if (NumEntries > Platform->Devices.size()) {
+  if (NumEntries > Platform->Devices.size())
     return OL_ERRC_INVALID_SIZE;
-  }
 
   for (uint32_t DeviceIndex = 0; DeviceIndex < NumEntries; DeviceIndex++) {
     Devices[DeviceIndex] = &(Platform->Devices[DeviceIndex]);
@@ -326,10 +327,9 @@ ol_impl_result_t olMemAlloc_impl(ol_device_handle_t Device,
                                  void **AllocationOut) {
   auto Alloc =
       Device->Device.dataAlloc(Size, nullptr, convertOlToPluginAllocTy(Type));
-  if (!Alloc) {
+  if (!Alloc)
     return {OL_ERRC_OUT_OF_RESOURCES,
             formatv("Could not create allocation on device {0}", Device).str()};
-  }
 
   *AllocationOut = *Alloc;
   return OL_SUCCESS;
@@ -338,9 +338,9 @@ ol_impl_result_t olMemAlloc_impl(ol_device_handle_t Device,
 ol_impl_result_t olMemFree_impl(ol_device_handle_t Device, ol_alloc_type_t Type,
                                 void *Address) {
   auto Res = Device->Device.dataDelete(Address, convertOlToPluginAllocTy(Type));
-  if (Res) {
+  if (Res)
     return {OL_ERRC_OUT_OF_RESOURCES, "Could not free allocation"};
-  }
+
   return OL_SUCCESS;
 }
 
@@ -348,10 +348,9 @@ ol_impl_result_t olCreateQueue_impl(ol_device_handle_t Device,
                                     ol_queue_handle_t *Queue) {
   auto CreatedQueue = std::make_unique<ol_queue_handle_t_>();
   auto Err = Device->Device.initAsyncInfo(&(CreatedQueue->AsyncInfo));
-  if (Err) {
-    return OL_ERRC_OUT_OF_RESOURCES;
-  }
-  // TODO: Check error
+  if (Err)
+    return {OL_ERRC_UNKNOWN, "Could not initialize stream resource"};
+
   CreatedQueue->Device = Device;
   CreatedQueue->RefCount = 1;
   *Queue = CreatedQueue.release();
@@ -364,9 +363,9 @@ ol_impl_result_t olRetainQueue_impl(ol_queue_handle_t Queue) {
 }
 
 ol_impl_result_t olReleaseQueue_impl(ol_queue_handle_t Queue) {
-  if (--Queue->RefCount == 0) {
+  if (--Queue->RefCount == 0)
     delete Queue;
-  }
+
   return OL_SUCCESS;
 }
 
@@ -375,27 +374,25 @@ ol_impl_result_t olFinishQueue_impl(ol_queue_handle_t Queue) {
   // on it, but we have nothing to synchronize in that situation anyway.
   if (Queue->AsyncInfo->Queue) {
     auto Err = Queue->Device->Device.synchronize(Queue->AsyncInfo);
-    if (Err) {
-      return OL_ERRC_OUT_OF_RESOURCES;
-    }
+    if (Err)
+      return {OL_ERRC_INVALID_QUEUE, "The queue failed to synchronize"};
   }
 
   // Recreate the stream resource so the queue can be reused
   // TODO: Would be easier for the synchronization to (optionally) not release
   // it to begin with.
   auto Res = Queue->Device->Device.initAsyncInfo(&Queue->AsyncInfo);
-  if (Res) {
-    return OL_ERRC_OUT_OF_RESOURCES;
-  }
+  if (Res)
+    return {OL_ERRC_UNKNOWN, "Could not reinitialize the stream resource"};
 
   return OL_SUCCESS;
 }
 
 ol_impl_result_t olWaitEvent_impl(ol_event_handle_t Event) {
   auto Res = Event->Device->Device.syncEvent(Event->EventInfo);
-  if (Res) {
-    return OL_ERRC_OUT_OF_RESOURCES;
-  }
+  if (Res)
+    return {OL_ERRC_INVALID_EVENT, "The event failed to synchronize"};
+
   return OL_SUCCESS;
 }
 
@@ -405,7 +402,9 @@ ol_impl_result_t olRetainEvent_impl(ol_event_handle_t Event) {
 }
 
 ol_impl_result_t olReleaseEvent_impl(ol_event_handle_t Event) {
-  Event->RefCount--;
+  if (--Event->RefCount == 0)
+    delete Event;
+
   return OL_SUCCESS;
 }
 
@@ -413,14 +412,13 @@ ol_event_handle_t makeEvent(ol_queue_handle_t Queue) {
   auto EventImpl = std::make_unique<ol_event_handle_t_>();
   EventImpl->Queue = Queue;
   auto Res = Queue->Device->Device.createEvent(&EventImpl->EventInfo);
-  if (Res) {
+  if (Res)
     return nullptr;
-  }
+
   Res =
       Queue->Device->Device.recordEvent(EventImpl->EventInfo, Queue->AsyncInfo);
-  if (Res) {
+  if (Res)
     return nullptr;
-  }
 
   return EventImpl.release();
 }
@@ -432,13 +430,11 @@ ol_impl_result_t olEnqueueDataWrite_impl(ol_queue_handle_t Queue, void *SrcPtr,
 
   auto Res = DeviceImpl.dataSubmit(DstPtr, SrcPtr, Size, Queue->AsyncInfo);
 
-  if (Res) {
-    return OL_ERRC_OUT_OF_RESOURCES;
-  }
+  if (Res)
+    return {OL_ERRC_UNKNOWN, "The data submit operation failed"};
 
-  if (EventOut) {
+  if (EventOut)
     *EventOut = makeEvent(Queue);
-  }
 
   return OL_SUCCESS;
 }
@@ -450,13 +446,11 @@ ol_impl_result_t olEnqueueDataRead_impl(ol_queue_handle_t Queue, void *SrcPtr,
 
   auto Res = DeviceImpl.dataRetrieve(DstPtr, SrcPtr, Size, Queue->AsyncInfo);
 
-  if (Res) {
-    return OL_ERRC_OUT_OF_RESOURCES;
-  }
+  if (Res)
+    return {OL_ERRC_UNKNOWN, "The data retrieve operation failed"};
 
-  if (EventOut) {
+  if (EventOut)
     *EventOut = makeEvent(Queue);
-  }
 
   return OL_SUCCESS;
 }
@@ -471,13 +465,11 @@ ol_impl_result_t olEnqueueDataCopy_impl(ol_queue_handle_t Queue, void *SrcPtr,
   auto Res = DeviceImpl.dataExchange(SrcPtr, DstDevice->Device, DstPtr, Size,
                                      Queue->AsyncInfo);
 
-  if (Res) {
-    return OL_ERRC_OUT_OF_RESOURCES;
-  }
+  if (Res)
+    return {OL_ERRC_UNKNOWN, "The data exchange operation failed"};
 
-  if (EventOut) {
+  if (EventOut)
     *EventOut = makeEvent(Queue);
-  }
 
   return OL_SUCCESS;
 }
@@ -485,12 +477,14 @@ ol_impl_result_t olEnqueueDataCopy_impl(ol_queue_handle_t Queue, void *SrcPtr,
 ol_impl_result_t olCreateProgram_impl(ol_device_handle_t Device, void *ProgData,
                                       size_t ProgDataSize,
                                       ol_program_handle_t *Program) {
+  // Make a copy of the program binary in case it is released by the caller.
+  // TODO: Make this copy optional.
   auto ImageData = MemoryBuffer::getMemBufferCopy(
       StringRef(reinterpret_cast<char *>(ProgData), ProgDataSize));
-  __tgt_device_image DeviceImage{(char *)ImageData->getBuffer().data(),
-                                 ((char *)ImageData->getBuffer().data()) +
-                                     ProgDataSize - 1,
-                                 nullptr, nullptr};
+  __tgt_device_image DeviceImage{
+      const_cast<char *>(ImageData->getBuffer().data()),
+      const_cast<char *>(ImageData->getBuffer().data()) + ProgDataSize - 1,
+      nullptr, nullptr};
 
   ol_program_handle_t Prog = new ol_program_handle_t_();
 
@@ -507,14 +501,14 @@ ol_impl_result_t olCreateProgram_impl(ol_device_handle_t Device, void *ProgData,
 }
 
 ol_impl_result_t olRetainProgram_impl(ol_program_handle_t Program) {
-  ++Program->RefCount;
+  Program->RefCount++;
   return OL_SUCCESS;
 }
 
 ol_impl_result_t olReleaseProgram_impl(ol_program_handle_t Program) {
-  if (--Program->RefCount == 0) {
+  if (--Program->RefCount == 0)
     delete Program;
-  }
+
   return OL_SUCCESS;
 }
 
@@ -524,14 +518,12 @@ ol_impl_result_t olCreateKernel_impl(ol_program_handle_t Program,
 
   auto &Device = Program->Image->getDevice();
   auto KernelImpl = Device.constructKernel(KernelName);
-  if (!KernelImpl) {
-    return OL_ERRC_OUT_OF_RESOURCES;
-  }
+  if (!KernelImpl)
+    return OL_ERRC_INVALID_KERNEL_NAME;
 
   auto Err = KernelImpl->init(Device, *Program->Image);
-  if (Err) {
-    return OL_ERRC_OUT_OF_RESOURCES;
-  }
+  if (Err)
+    return {OL_ERRC_UNKNOWN, "Could not initialize the kernel"};
 
   ol_kernel_handle_t CreatedKernel = new ol_kernel_handle_t_();
   CreatedKernel->Program = Program;
@@ -548,9 +540,9 @@ ol_impl_result_t olRetainKernel_impl(ol_kernel_handle_t Kernel) {
 }
 
 ol_impl_result_t olReleaseKernel_impl(ol_kernel_handle_t Kernel) {
-  if (--Kernel->RefCount == 0) {
+  if (--Kernel->RefCount == 0)
     delete Kernel;
-  }
+
   return OL_SUCCESS;
 }
 
@@ -579,23 +571,22 @@ olEnqueueKernelLaunch_impl(ol_queue_handle_t Queue, ol_kernel_handle_t Kernel,
   LaunchArgs.ThreadLimit[1] = LaunchSizeArgs->GroupSizeY;
   LaunchArgs.ThreadLimit[2] = LaunchSizeArgs->GroupSizeZ;
 
-  LaunchArgs.ArgPtrs = (void **)Kernel->Args.getStorage();
+  LaunchArgs.ArgPtrs =
+      reinterpret_cast<void **>(const_cast<char *>(Kernel->Args.getStorage()));
 
   // No offsets needed, arguments are real pointers
   auto ArgOffsets = std::vector<ptrdiff_t>(LaunchArgs.NumArgs, 0ul);
 
-  auto Err = Kernel->KernelImpl->launch(
-      DeviceImpl, (void **)Kernel->Args.getStorage(), ArgOffsets.data(),
-      LaunchArgs, AsyncInfoWrapper);
+  auto Err = Kernel->KernelImpl->launch(DeviceImpl, LaunchArgs.ArgPtrs,
+                                        ArgOffsets.data(), LaunchArgs,
+                                        AsyncInfoWrapper);
 
   AsyncInfoWrapper.finalize(Err);
-  if (Err) {
-    return OL_ERRC_OUT_OF_RESOURCES;
-  }
+  if (Err)
+    return {OL_ERRC_UNKNOWN, "Could not finalize the AsyncInfoWrapper"};
 
-  if (EventOut) {
+  if (EventOut)
     *EventOut = makeEvent(Queue);
-  }
 
   return OL_SUCCESS;
 }

>From 44122e140ea2b7f374e2315cb8e5fc972dec808b Mon Sep 17 00:00:00 2001
From: Callum Fare <callum at codeplay.com>
Date: Fri, 7 Feb 2025 13:56:08 +0000
Subject: [PATCH 12/17] Revert plugin changes

The offload unit tests will no longer work on host.
Kernel execution will no longer work on CUDA.
---
 .../common/include/GlobalHandler.h            |  5 ++--
 offload/plugins-nextgen/cuda/src/rtl.cpp      | 28 -------------------
 offload/plugins-nextgen/host/src/rtl.cpp      |  2 +-
 3 files changed, 3 insertions(+), 32 deletions(-)

diff --git a/offload/plugins-nextgen/common/include/GlobalHandler.h b/offload/plugins-nextgen/common/include/GlobalHandler.h
index d65fceb8508d2..d2914e7cd0eb4 100644
--- a/offload/plugins-nextgen/common/include/GlobalHandler.h
+++ b/offload/plugins-nextgen/common/include/GlobalHandler.h
@@ -131,9 +131,8 @@ class GenericGlobalHandlerTy {
 
   /// Get the address and size of a global in the image. Address and size are
   /// return in \p ImageGlobal, the global name is passed in \p ImageGlobal.
-  virtual Error getGlobalMetadataFromImage(GenericDeviceTy &Device,
-                                           DeviceImageTy &Image,
-                                           GlobalTy &ImageGlobal);
+  Error getGlobalMetadataFromImage(GenericDeviceTy &Device,
+                                   DeviceImageTy &Image, GlobalTy &ImageGlobal);
 
   /// Read the memory associated with a global from the image and store it on
   /// the host. The name, size, and destination are defined by \p HostGlobal.
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index d7a69091ada74..894d1c2214b97 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -1327,34 +1327,6 @@ class CUDAGlobalHandlerTy final : public GenericGlobalHandlerTy {
     DeviceGlobal.setPtr(reinterpret_cast<void *>(CUPtr));
     return Plugin::success();
   }
-
-  Error getGlobalMetadataFromImage(GenericDeviceTy &Device,
-                                   DeviceImageTy &Image,
-                                   GlobalTy &ImageGlobal) override {
-    // If the image is an ELF we can use the generic path, otherwise fall back
-    // and use cuModuleGetGlobal to query the image.
-    if (utils::elf::isELF(Image.getMemoryBuffer().getBuffer())) {
-      return GenericGlobalHandlerTy::getGlobalMetadataFromImage(Device, Image,
-                                                                ImageGlobal);
-    }
-
-    CUDADeviceImageTy &CUDAImage = static_cast<CUDADeviceImageTy &>(Image);
-
-    const char *GlobalName = ImageGlobal.getName().data();
-
-    size_t CUSize;
-    CUdeviceptr CUPtr;
-    CUresult Res =
-        cuModuleGetGlobal(&CUPtr, &CUSize, CUDAImage.getModule(), GlobalName);
-    if (auto Err = Plugin::check(Res, "Error in cuModuleGetGlobal for '%s': %s",
-                                 GlobalName))
-      return Err;
-
-    // Setup the global symbol's address and size.
-    ImageGlobal.setPtr(reinterpret_cast<void *>(CUPtr));
-    ImageGlobal.setSize(CUSize);
-    return Plugin::success();
-  }
 };
 
 /// Class implementing the CUDA-specific functionalities of the plugin.
diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp
index 1ba9a49f4f9af..1d4db95fff500 100644
--- a/offload/plugins-nextgen/host/src/rtl.cpp
+++ b/offload/plugins-nextgen/host/src/rtl.cpp
@@ -289,7 +289,7 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
 
   /// This plugin does not support interoperability, do nothing
   Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
-    return Plugin::success();
+    return Plugin::error("initAsyncInfoImpl not supported");
   }
 
   /// This plugin does not support interoperability

>From 2aea02229fcc7833db9ed2fc1bb90b74aa5a1d25 Mon Sep 17 00:00:00 2001
From: Callum Fare <callum at codeplay.com>
Date: Fri, 7 Feb 2025 15:07:43 +0000
Subject: [PATCH 13/17] Rename `ol_*_handle_t_` -> `ol_*_impl_t`

---
 .../liboffload/include/generated/OffloadAPI.h | 14 ++++----
 offload/liboffload/src/OffloadImpl.cpp        | 32 ++++++++++---------
 offload/tools/offload-tblgen/APIGen.cpp       |  9 +++++-
 3 files changed, 32 insertions(+), 23 deletions(-)

diff --git a/offload/liboffload/include/generated/OffloadAPI.h b/offload/liboffload/include/generated/OffloadAPI.h
index f8683af811ef7..e463efd41cce1 100644
--- a/offload/liboffload/include/generated/OffloadAPI.h
+++ b/offload/liboffload/include/generated/OffloadAPI.h
@@ -75,31 +75,31 @@ extern "C" {
 
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Handle of a platform instance
-typedef struct ol_platform_handle_t_ *ol_platform_handle_t;
+typedef struct ol_platform_impl_t *ol_platform_handle_t;
 
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Handle of platform's device object
-typedef struct ol_device_handle_t_ *ol_device_handle_t;
+typedef struct ol_device_impl_t *ol_device_handle_t;
 
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Handle of context object
-typedef struct ol_context_handle_t_ *ol_context_handle_t;
+typedef struct ol_context_impl_t *ol_context_handle_t;
 
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Handle of queue object
-typedef struct ol_queue_handle_t_ *ol_queue_handle_t;
+typedef struct ol_queue_impl_t *ol_queue_handle_t;
 
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Handle of event object
-typedef struct ol_event_handle_t_ *ol_event_handle_t;
+typedef struct ol_event_impl_t *ol_event_handle_t;
 
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Handle of program object
-typedef struct ol_program_handle_t_ *ol_program_handle_t;
+typedef struct ol_program_impl_t *ol_program_handle_t;
 
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Handle of kernel object
-typedef struct ol_kernel_handle_t_ *ol_kernel_handle_t;
+typedef struct ol_kernel_impl_t *ol_kernel_handle_t;
 
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Defines Return/Error codes
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index 8be4c76030b36..89cc42823261f 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -22,32 +22,34 @@
 using namespace llvm;
 using namespace llvm::omp::target::plugin;
 
-// Handle type definitions. Ideally these would be 1:1 with the plugins
-struct ol_device_handle_t_ {
+// Handle type definitions. Ideally these would be 1:1 with the plugins, but
+// we add some additional data here for now to avoid churn in the plugin
+// interface.
+struct ol_device_impl_t {
   int DeviceNum;
   GenericDeviceTy &Device;
   ol_platform_handle_t Platform;
 };
 
-struct ol_platform_handle_t_ {
+struct ol_platform_impl_t {
   std::unique_ptr<GenericPluginTy> Plugin;
-  std::vector<ol_device_handle_t_> Devices;
+  std::vector<ol_device_impl_t> Devices;
 };
 
-struct ol_queue_handle_t_ {
+struct ol_queue_impl_t {
   __tgt_async_info *AsyncInfo;
   ol_device_handle_t Device;
   std::atomic_uint32_t RefCount;
 };
 
-struct ol_event_handle_t_ {
+struct ol_event_impl_t {
   void *EventInfo;
   ol_queue_handle_t Queue;
   ol_device_handle_t Device;
   std::atomic_uint32_t RefCount;
 };
 
-struct ol_program_handle_t_ {
+struct ol_program_impl_t {
   llvm::omp::target::plugin::DeviceImageTy *Image;
   std::unique_ptr<MemoryBuffer> ImageData;
   std::atomic_uint32_t RefCount;
@@ -92,14 +94,14 @@ struct OffloadKernelArguments {
   const char *getStorage() const noexcept { return Storage.data(); }
 };
 
-struct ol_kernel_handle_t_ {
+struct ol_kernel_impl_t {
   ol_program_handle_t Program;
   std::atomic_uint32_t RefCount;
   GenericKernelTy *KernelImpl;
   OffloadKernelArguments Args;
 };
 
-using PlatformVecT = SmallVector<ol_platform_handle_t_, 4>;
+using PlatformVecT = SmallVector<ol_platform_impl_t, 4>;
 PlatformVecT &Platforms() {
   static PlatformVecT Platforms;
   return Platforms;
@@ -128,7 +130,7 @@ void initPlugins() {
   // Attempt to create an instance of each supported plugin.
 #define PLUGIN_TARGET(Name)                                                    \
   do {                                                                         \
-    Platforms().emplace_back(ol_platform_handle_t_{                            \
+    Platforms().emplace_back(ol_platform_impl_t{                            \
         std::unique_ptr<GenericPluginTy>(createPlugin_##Name()), {}});         \
   } while (false);
 #include "Shared/Targets.def"
@@ -141,7 +143,7 @@ void initPlugins() {
     for (auto DevNum = 0; DevNum < Platform.Plugin->number_of_devices();
          DevNum++) {
       if (Platform.Plugin->init_device(DevNum) == OFFLOAD_SUCCESS) {
-        Platform.Devices.emplace_back(ol_device_handle_t_{
+        Platform.Devices.emplace_back(ol_device_impl_t{
             DevNum, Platform.Plugin->getDevice(DevNum), &Platform});
       }
     }
@@ -346,7 +348,7 @@ ol_impl_result_t olMemFree_impl(ol_device_handle_t Device, ol_alloc_type_t Type,
 
 ol_impl_result_t olCreateQueue_impl(ol_device_handle_t Device,
                                     ol_queue_handle_t *Queue) {
-  auto CreatedQueue = std::make_unique<ol_queue_handle_t_>();
+  auto CreatedQueue = std::make_unique<ol_queue_impl_t>();
   auto Err = Device->Device.initAsyncInfo(&(CreatedQueue->AsyncInfo));
   if (Err)
     return {OL_ERRC_UNKNOWN, "Could not initialize stream resource"};
@@ -409,7 +411,7 @@ ol_impl_result_t olReleaseEvent_impl(ol_event_handle_t Event) {
 }
 
 ol_event_handle_t makeEvent(ol_queue_handle_t Queue) {
-  auto EventImpl = std::make_unique<ol_event_handle_t_>();
+  auto EventImpl = std::make_unique<ol_event_impl_t>();
   EventImpl->Queue = Queue;
   auto Res = Queue->Device->Device.createEvent(&EventImpl->EventInfo);
   if (Res)
@@ -486,7 +488,7 @@ ol_impl_result_t olCreateProgram_impl(ol_device_handle_t Device, void *ProgData,
       const_cast<char *>(ImageData->getBuffer().data()) + ProgDataSize - 1,
       nullptr, nullptr};
 
-  ol_program_handle_t Prog = new ol_program_handle_t_();
+  ol_program_handle_t Prog = new ol_program_impl_t();
 
   auto Res = Device->Device.loadBinary(Device->Device.Plugin, &DeviceImage);
   if (!Res)
@@ -525,7 +527,7 @@ ol_impl_result_t olCreateKernel_impl(ol_program_handle_t Program,
   if (Err)
     return {OL_ERRC_UNKNOWN, "Could not initialize the kernel"};
 
-  ol_kernel_handle_t CreatedKernel = new ol_kernel_handle_t_();
+  ol_kernel_handle_t CreatedKernel = new ol_kernel_impl_t();
   CreatedKernel->Program = Program;
   CreatedKernel->RefCount = 1;
   CreatedKernel->KernelImpl = &*KernelImpl;
diff --git a/offload/tools/offload-tblgen/APIGen.cpp b/offload/tools/offload-tblgen/APIGen.cpp
index 97a2464f7a75c..8cc5bd5e452fe 100644
--- a/offload/tools/offload-tblgen/APIGen.cpp
+++ b/offload/tools/offload-tblgen/APIGen.cpp
@@ -41,9 +41,16 @@ static std::string MakeComment(StringRef in) {
 }
 
 static void ProcessHandle(const HandleRec &H, raw_ostream &OS) {
+  if (!H.getName().ends_with("_handle_t")) {
+    errs() << "Handle type name (" << H.getName()
+           << ") must end with '_handle_t'!\n";
+    exit(1);
+  }
+
+  auto ImplName = H.getName().substr(0, H.getName().size() - 9) + "_impl_t";
   OS << CommentsHeader;
   OS << formatv("/// @brief {0}\n", H.getDesc());
-  OS << formatv("typedef struct {0}_ *{0};\n", H.getName());
+  OS << formatv("typedef struct {0} *{1};\n", ImplName, H.getName());
 }
 
 static void ProcessTypedef(const TypedefRec &T, raw_ostream &OS) {

>From 5c121fa88ec33671c71babcd580216f4609b667e Mon Sep 17 00:00:00 2001
From: Callum Fare <callum at codeplay.com>
Date: Tue, 11 Feb 2025 16:58:55 +0000
Subject: [PATCH 14/17] Various fixes to address review feedback

---
 offload/liboffload/API/Enqueue.td             |   8 +-
 offload/liboffload/API/Program.td             |   2 +-
 offload/liboffload/API/README.md              |   6 +-
 .../liboffload/include/generated/OffloadAPI.h |  46 +--
 .../include/generated/OffloadEntryPoints.inc  | 335 +++++++++---------
 .../generated/OffloadImplFuncDecls.inc        |  15 +-
 .../include/generated/OffloadPrint.hpp        |  24 +-
 offload/liboffload/src/OffloadImpl.cpp        |  13 +-
 .../tools/offload-tblgen/EntryPointGen.cpp    |  10 +-
 .../OffloadAPI/enqueue/olEnqueueDataCopy.cpp  |   6 +-
 .../OffloadAPI/enqueue/olEnqueueDataRead.cpp  |   4 +-
 .../OffloadAPI/enqueue/olEnqueueDataWrite.cpp |   2 +-
 12 files changed, 234 insertions(+), 237 deletions(-)

diff --git a/offload/liboffload/API/Enqueue.td b/offload/liboffload/API/Enqueue.td
index f503bf3c44139..695b157ac1de3 100644
--- a/offload/liboffload/API/Enqueue.td
+++ b/offload/liboffload/API/Enqueue.td
@@ -16,8 +16,8 @@ def : Function {
     let details = [];
     let params = [
         Param<"ol_queue_handle_t", "Queue", "handle of the queue", PARAM_IN>,
-        Param<"void*", "SrcPtr", "host pointer to copy from", PARAM_IN>,
         Param<"void*", "DstPtr", "device pointer to copy to", PARAM_IN>,
+        Param<"void*", "SrcPtr", "host pointer to copy from", PARAM_IN>,
         Param<"size_t", "Size", "size in bytes of data to copy", PARAM_IN>,
         Param<"ol_event_handle_t*", "EventOut", "optional recorded event for the enqueued operation", PARAM_OUT_OPTIONAL>
     ];
@@ -32,8 +32,8 @@ def : Function {
     let details = [];
     let params = [
         Param<"ol_queue_handle_t", "Queue", "handle of the queue", PARAM_IN>,
-        Param<"void*", "SrcPtr", "device pointer to copy from", PARAM_IN>,
         Param<"void*", "DstPtr", "host pointer to copy to", PARAM_IN>,
+        Param<"void*", "SrcPtr", "device pointer to copy from", PARAM_IN>,
         Param<"size_t", "Size", "size in bytes of data to copy", PARAM_IN>,
         Param<"ol_event_handle_t*", "EventOut", "optional recorded event for the enqueued operation", PARAM_OUT_OPTIONAL>
     ];
@@ -46,9 +46,9 @@ def : Function {
     let details = [];
     let params = [
         Param<"ol_queue_handle_t", "Queue", "handle of the queue", PARAM_IN>,
-        Param<"void*", "SrcPtr", "device pointer to copy from", PARAM_IN>,
-        Param<"void*", "DstPtr", "device pointer to copy to", PARAM_IN>,
         Param<"ol_device_handle_t", "DstDevice", "device that the destination pointer is resident on", PARAM_IN>,
+        Param<"void*", "DstPtr", "device pointer to copy to", PARAM_IN>,
+        Param<"void*", "SrcPtr", "device pointer to copy from", PARAM_IN>,
         Param<"size_t", "Size", "size in bytes of data to copy", PARAM_IN>,
         Param<"ol_event_handle_t*", "EventOut", "optional recorded event for the enqueued operation", PARAM_OUT_OPTIONAL>
     ];
diff --git a/offload/liboffload/API/Program.td b/offload/liboffload/API/Program.td
index d928f3d0cc2f1..b2ea21f0877e6 100644
--- a/offload/liboffload/API/Program.td
+++ b/offload/liboffload/API/Program.td
@@ -20,7 +20,7 @@ def : Function {
         Param<"ol_device_handle_t", "Device", "handle of the device", PARAM_IN>,
         Param<"void*", "ProgData", "pointer to the program binary data", PARAM_IN>,
         Param<"size_t", "ProgDataSize", "size of the program binary in bytes", PARAM_IN>,
-        Param<"ol_program_handle_t*", "Queue", "output pointer for the created program", PARAM_OUT>
+        Param<"ol_program_handle_t*", "Program", "output pointer for the created program", PARAM_OUT>
     ];
     let returns = [];
 }
diff --git a/offload/liboffload/API/README.md b/offload/liboffload/API/README.md
index 38a055811b2d0..a205dc007b2ae 100644
--- a/offload/liboffload/API/README.md
+++ b/offload/liboffload/API/README.md
@@ -138,8 +138,8 @@ allow more backends to be easily added in future.
 
 A new object can be added to the API by adding to one of the existing `.td`
 files. It is also possible to add a new tablegen file to the API by adding it
-to the includes in `OffloadAPI.td`. When the offload target is rebuilt, the
-new definition will be included in the generated files.
+to the includes in `OffloadAPI.td`. When the `OffloadGenerate` target is
+rebuilt, the new definition will be included in the generated files.
 
 ### Adding a new entry point
 
@@ -147,4 +147,4 @@ When a new entry point is added (e.g. `offloadDeviceFoo`), the actual entry
 point is automatically generated, which contains validation and tracing code.
 It expects an implementation function (`offloadDeviceFoo_impl`) to be defined,
 which it will call into. The definition of this implementation function should
-be added to `src/offload_impl.cpp`
+be added to `src/OffloadImpl.cpp`.
diff --git a/offload/liboffload/include/generated/OffloadAPI.h b/offload/liboffload/include/generated/OffloadAPI.h
index e463efd41cce1..110d252fe45a7 100644
--- a/offload/liboffload/include/generated/OffloadAPI.h
+++ b/offload/liboffload/include/generated/OffloadAPI.h
@@ -667,15 +667,15 @@ OL_APIEXPORT ol_result_t OL_APICALL olWaitEvent(
 ///     - ::OL_ERRC_INVALID_NULL_HANDLE
 ///         + `NULL == Queue`
 ///     - ::OL_ERRC_INVALID_NULL_POINTER
-///         + `NULL == SrcPtr`
 ///         + `NULL == DstPtr`
+///         + `NULL == SrcPtr`
 OL_APIEXPORT ol_result_t OL_APICALL olEnqueueDataWrite(
     // [in] handle of the queue
     ol_queue_handle_t Queue,
-    // [in] host pointer to copy from
-    void *SrcPtr,
     // [in] device pointer to copy to
     void *DstPtr,
+    // [in] host pointer to copy from
+    void *SrcPtr,
     // [in] size in bytes of data to copy
     size_t Size,
     // [out][optional] optional recorded event for the enqueued operation
@@ -693,15 +693,15 @@ OL_APIEXPORT ol_result_t OL_APICALL olEnqueueDataWrite(
 ///     - ::OL_ERRC_INVALID_NULL_HANDLE
 ///         + `NULL == Queue`
 ///     - ::OL_ERRC_INVALID_NULL_POINTER
-///         + `NULL == SrcPtr`
 ///         + `NULL == DstPtr`
+///         + `NULL == SrcPtr`
 OL_APIEXPORT ol_result_t OL_APICALL olEnqueueDataRead(
     // [in] handle of the queue
     ol_queue_handle_t Queue,
-    // [in] device pointer to copy from
-    void *SrcPtr,
     // [in] host pointer to copy to
     void *DstPtr,
+    // [in] device pointer to copy from
+    void *SrcPtr,
     // [in] size in bytes of data to copy
     size_t Size,
     // [out][optional] optional recorded event for the enqueued operation
@@ -720,17 +720,17 @@ OL_APIEXPORT ol_result_t OL_APICALL olEnqueueDataRead(
 ///         + `NULL == Queue`
 ///         + `NULL == DstDevice`
 ///     - ::OL_ERRC_INVALID_NULL_POINTER
-///         + `NULL == SrcPtr`
 ///         + `NULL == DstPtr`
+///         + `NULL == SrcPtr`
 OL_APIEXPORT ol_result_t OL_APICALL olEnqueueDataCopy(
     // [in] handle of the queue
     ol_queue_handle_t Queue,
-    // [in] device pointer to copy from
-    void *SrcPtr,
-    // [in] device pointer to copy to
-    void *DstPtr,
     // [in] device that the destination pointer is resident on
     ol_device_handle_t DstDevice,
+    // [in] device pointer to copy to
+    void *DstPtr,
+    // [in] device pointer to copy from
+    void *SrcPtr,
     // [in] size in bytes of data to copy
     size_t Size,
     // [out][optional] optional recorded event for the enqueued operation
@@ -787,7 +787,7 @@ OL_APIEXPORT ol_result_t OL_APICALL olEnqueueKernelLaunch(
 ///         + `NULL == Device`
 ///     - ::OL_ERRC_INVALID_NULL_POINTER
 ///         + `NULL == ProgData`
-///         + `NULL == Queue`
+///         + `NULL == Program`
 OL_APIEXPORT ol_result_t OL_APICALL olCreateProgram(
     // [in] handle of the device
     ol_device_handle_t Device,
@@ -796,7 +796,7 @@ OL_APIEXPORT ol_result_t OL_APICALL olCreateProgram(
     // [in] size of the program binary in bytes
     size_t ProgDataSize,
     // [out] output pointer for the created program
-    ol_program_handle_t *Queue);
+    ol_program_handle_t *Program);
 
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Increment the program's reference count
@@ -1082,8 +1082,8 @@ typedef struct ol_wait_event_params_t {
 /// @details Each entry is a pointer to the parameter passed to the function;
 typedef struct ol_enqueue_data_write_params_t {
   ol_queue_handle_t *pQueue;
-  void **pSrcPtr;
   void **pDstPtr;
+  void **pSrcPtr;
   size_t *pSize;
   ol_event_handle_t **pEventOut;
 } ol_enqueue_data_write_params_t;
@@ -1093,8 +1093,8 @@ typedef struct ol_enqueue_data_write_params_t {
 /// @details Each entry is a pointer to the parameter passed to the function;
 typedef struct ol_enqueue_data_read_params_t {
   ol_queue_handle_t *pQueue;
-  void **pSrcPtr;
   void **pDstPtr;
+  void **pSrcPtr;
   size_t *pSize;
   ol_event_handle_t **pEventOut;
 } ol_enqueue_data_read_params_t;
@@ -1104,9 +1104,9 @@ typedef struct ol_enqueue_data_read_params_t {
 /// @details Each entry is a pointer to the parameter passed to the function;
 typedef struct ol_enqueue_data_copy_params_t {
   ol_queue_handle_t *pQueue;
-  void **pSrcPtr;
-  void **pDstPtr;
   ol_device_handle_t *pDstDevice;
+  void **pDstPtr;
+  void **pSrcPtr;
   size_t *pSize;
   ol_event_handle_t **pEventOut;
 } ol_enqueue_data_copy_params_t;
@@ -1128,7 +1128,7 @@ typedef struct ol_create_program_params_t {
   ol_device_handle_t *pDevice;
   void **pProgData;
   size_t *pProgDataSize;
-  ol_program_handle_t **pQueue;
+  ol_program_handle_t **pProgram;
 } ol_create_program_params_t;
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1331,7 +1331,7 @@ OL_APIEXPORT ol_result_t OL_APICALL olWaitEventWithCodeLoc(
 /// information
 /// @details See also ::olEnqueueDataWrite
 OL_APIEXPORT ol_result_t OL_APICALL olEnqueueDataWriteWithCodeLoc(
-    ol_queue_handle_t Queue, void *SrcPtr, void *DstPtr, size_t Size,
+    ol_queue_handle_t Queue, void *DstPtr, void *SrcPtr, size_t Size,
     ol_event_handle_t *EventOut, ol_code_location_t *CodeLocation);
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1339,7 +1339,7 @@ OL_APIEXPORT ol_result_t OL_APICALL olEnqueueDataWriteWithCodeLoc(
 /// information
 /// @details See also ::olEnqueueDataRead
 OL_APIEXPORT ol_result_t OL_APICALL olEnqueueDataReadWithCodeLoc(
-    ol_queue_handle_t Queue, void *SrcPtr, void *DstPtr, size_t Size,
+    ol_queue_handle_t Queue, void *DstPtr, void *SrcPtr, size_t Size,
     ol_event_handle_t *EventOut, ol_code_location_t *CodeLocation);
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1347,8 +1347,8 @@ OL_APIEXPORT ol_result_t OL_APICALL olEnqueueDataReadWithCodeLoc(
 /// information
 /// @details See also ::olEnqueueDataCopy
 OL_APIEXPORT ol_result_t OL_APICALL olEnqueueDataCopyWithCodeLoc(
-    ol_queue_handle_t Queue, void *SrcPtr, void *DstPtr,
-    ol_device_handle_t DstDevice, size_t Size, ol_event_handle_t *EventOut,
+    ol_queue_handle_t Queue, ol_device_handle_t DstDevice, void *DstPtr,
+    void *SrcPtr, size_t Size, ol_event_handle_t *EventOut,
     ol_code_location_t *CodeLocation);
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1366,7 +1366,7 @@ OL_APIEXPORT ol_result_t OL_APICALL olEnqueueKernelLaunchWithCodeLoc(
 /// @details See also ::olCreateProgram
 OL_APIEXPORT ol_result_t OL_APICALL olCreateProgramWithCodeLoc(
     ol_device_handle_t Device, void *ProgData, size_t ProgDataSize,
-    ol_program_handle_t *Queue, ol_code_location_t *CodeLocation);
+    ol_program_handle_t *Program, ol_code_location_t *CodeLocation);
 
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Variant of olRetainProgram that also sets source code location
diff --git a/offload/liboffload/include/generated/OffloadEntryPoints.inc b/offload/liboffload/include/generated/OffloadEntryPoints.inc
index ffef36b5a9ac4..bd9641f74d1bb 100644
--- a/offload/liboffload/include/generated/OffloadEntryPoints.inc
+++ b/offload/liboffload/include/generated/OffloadEntryPoints.inc
@@ -15,16 +15,16 @@ ol_impl_result_t olInit_val() {
 }
 OL_APIEXPORT ol_result_t OL_APICALL olInit() {
   if (offloadConfig().TracingEnabled) {
-    std::cout << "---> olInit";
+    std::cerr << "---> olInit";
   }
 
   ol_result_t Result = olInit_val();
 
   if (offloadConfig().TracingEnabled) {
-    std::cout << "()";
-    std::cout << "-> " << Result << "\n";
+    std::cerr << "()";
+    std::cerr << "-> " << Result << "\n";
     if (Result && Result->Details) {
-      std::cout << "     *Error Details* " << Result->Details << " \n";
+      std::cerr << "     *Error Details* " << Result->Details << " \n";
     }
   }
   return Result;
@@ -46,16 +46,16 @@ ol_impl_result_t olShutDown_val() {
 }
 OL_APIEXPORT ol_result_t OL_APICALL olShutDown() {
   if (offloadConfig().TracingEnabled) {
-    std::cout << "---> olShutDown";
+    std::cerr << "---> olShutDown";
   }
 
   ol_result_t Result = olShutDown_val();
 
   if (offloadConfig().TracingEnabled) {
-    std::cout << "()";
-    std::cout << "-> " << Result << "\n";
+    std::cerr << "()";
+    std::cerr << "-> " << Result << "\n";
     if (Result && Result->Details) {
-      std::cout << "     *Error Details* " << Result->Details << " \n";
+      std::cerr << "     *Error Details* " << Result->Details << " \n";
     }
   }
   return Result;
@@ -86,17 +86,17 @@ ol_impl_result_t olGetPlatform_val(uint32_t NumEntries,
 OL_APIEXPORT ol_result_t OL_APICALL
 olGetPlatform(uint32_t NumEntries, ol_platform_handle_t *Platforms) {
   if (offloadConfig().TracingEnabled) {
-    std::cout << "---> olGetPlatform";
+    std::cerr << "---> olGetPlatform";
   }
 
   ol_result_t Result = olGetPlatform_val(NumEntries, Platforms);
 
   if (offloadConfig().TracingEnabled) {
     ol_get_platform_params_t Params = {&NumEntries, &Platforms};
-    std::cout << "(" << &Params << ")";
-    std::cout << "-> " << Result << "\n";
+    std::cerr << "(" << &Params << ")";
+    std::cerr << "-> " << Result << "\n";
     if (Result && Result->Details) {
-      std::cout << "     *Error Details* " << Result->Details << " \n";
+      std::cerr << "     *Error Details* " << Result->Details << " \n";
     }
   }
   return Result;
@@ -123,17 +123,17 @@ ol_impl_result_t olGetPlatformCount_val(uint32_t *NumPlatforms) {
 }
 OL_APIEXPORT ol_result_t OL_APICALL olGetPlatformCount(uint32_t *NumPlatforms) {
   if (offloadConfig().TracingEnabled) {
-    std::cout << "---> olGetPlatformCount";
+    std::cerr << "---> olGetPlatformCount";
   }
 
   ol_result_t Result = olGetPlatformCount_val(NumPlatforms);
 
   if (offloadConfig().TracingEnabled) {
     ol_get_platform_count_params_t Params = {&NumPlatforms};
-    std::cout << "(" << &Params << ")";
-    std::cout << "-> " << Result << "\n";
+    std::cerr << "(" << &Params << ")";
+    std::cerr << "-> " << Result << "\n";
     if (Result && Result->Details) {
-      std::cout << "     *Error Details* " << Result->Details << " \n";
+      std::cerr << "     *Error Details* " << Result->Details << " \n";
     }
   }
   return Result;
@@ -171,7 +171,7 @@ OL_APIEXPORT ol_result_t OL_APICALL
 olGetPlatformInfo(ol_platform_handle_t Platform, ol_platform_info_t PropName,
                   size_t PropSize, void *PropValue) {
   if (offloadConfig().TracingEnabled) {
-    std::cout << "---> olGetPlatformInfo";
+    std::cerr << "---> olGetPlatformInfo";
   }
 
   ol_result_t Result =
@@ -180,10 +180,10 @@ olGetPlatformInfo(ol_platform_handle_t Platform, ol_platform_info_t PropName,
   if (offloadConfig().TracingEnabled) {
     ol_get_platform_info_params_t Params = {&Platform, &PropName, &PropSize,
                                             &PropValue};
-    std::cout << "(" << &Params << ")";
-    std::cout << "-> " << Result << "\n";
+    std::cerr << "(" << &Params << ")";
+    std::cerr << "-> " << Result << "\n";
     if (Result && Result->Details) {
-      std::cout << "     *Error Details* " << Result->Details << " \n";
+      std::cerr << "     *Error Details* " << Result->Details << " \n";
     }
   }
   return Result;
@@ -220,7 +220,7 @@ OL_APIEXPORT ol_result_t OL_APICALL
 olGetPlatformInfoSize(ol_platform_handle_t Platform,
                       ol_platform_info_t PropName, size_t *PropSizeRet) {
   if (offloadConfig().TracingEnabled) {
-    std::cout << "---> olGetPlatformInfoSize";
+    std::cerr << "---> olGetPlatformInfoSize";
   }
 
   ol_result_t Result =
@@ -229,10 +229,10 @@ olGetPlatformInfoSize(ol_platform_handle_t Platform,
   if (offloadConfig().TracingEnabled) {
     ol_get_platform_info_size_params_t Params = {&Platform, &PropName,
                                                  &PropSizeRet};
-    std::cout << "(" << &Params << ")";
-    std::cout << "-> " << Result << "\n";
+    std::cerr << "(" << &Params << ")";
+    std::cerr << "-> " << Result << "\n";
     if (Result && Result->Details) {
-      std::cout << "     *Error Details* " << Result->Details << " \n";
+      std::cerr << "     *Error Details* " << Result->Details << " \n";
     }
   }
   return Result;
@@ -266,17 +266,17 @@ ol_impl_result_t olGetDeviceCount_val(ol_platform_handle_t Platform,
 OL_APIEXPORT ol_result_t OL_APICALL
 olGetDeviceCount(ol_platform_handle_t Platform, uint32_t *NumDevices) {
   if (offloadConfig().TracingEnabled) {
-    std::cout << "---> olGetDeviceCount";
+    std::cerr << "---> olGetDeviceCount";
   }
 
   ol_result_t Result = olGetDeviceCount_val(Platform, NumDevices);
 
   if (offloadConfig().TracingEnabled) {
     ol_get_device_count_params_t Params = {&Platform, &NumDevices};
-    std::cout << "(" << &Params << ")";
-    std::cout << "-> " << Result << "\n";
+    std::cerr << "(" << &Params << ")";
+    std::cerr << "-> " << Result << "\n";
     if (Result && Result->Details) {
-      std::cout << "     *Error Details* " << Result->Details << " \n";
+      std::cerr << "     *Error Details* " << Result->Details << " \n";
     }
   }
   return Result;
@@ -315,17 +315,17 @@ OL_APIEXPORT ol_result_t OL_APICALL olGetDevice(ol_platform_handle_t Platform,
                                                 uint32_t NumEntries,
                                                 ol_device_handle_t *Devices) {
   if (offloadConfig().TracingEnabled) {
-    std::cout << "---> olGetDevice";
+    std::cerr << "---> olGetDevice";
   }
 
   ol_result_t Result = olGetDevice_val(Platform, NumEntries, Devices);
 
   if (offloadConfig().TracingEnabled) {
     ol_get_device_params_t Params = {&Platform, &NumEntries, &Devices};
-    std::cout << "(" << &Params << ")";
-    std::cout << "-> " << Result << "\n";
+    std::cerr << "(" << &Params << ")";
+    std::cerr << "-> " << Result << "\n";
     if (Result && Result->Details) {
-      std::cout << "     *Error Details* " << Result->Details << " \n";
+      std::cerr << "     *Error Details* " << Result->Details << " \n";
     }
   }
   return Result;
@@ -366,7 +366,7 @@ OL_APIEXPORT ol_result_t OL_APICALL olGetDeviceInfo(ol_device_handle_t Device,
                                                     size_t PropSize,
                                                     void *PropValue) {
   if (offloadConfig().TracingEnabled) {
-    std::cout << "---> olGetDeviceInfo";
+    std::cerr << "---> olGetDeviceInfo";
   }
 
   ol_result_t Result =
@@ -375,10 +375,10 @@ OL_APIEXPORT ol_result_t OL_APICALL olGetDeviceInfo(ol_device_handle_t Device,
   if (offloadConfig().TracingEnabled) {
     ol_get_device_info_params_t Params = {&Device, &PropName, &PropSize,
                                           &PropValue};
-    std::cout << "(" << &Params << ")";
-    std::cout << "-> " << Result << "\n";
+    std::cerr << "(" << &Params << ")";
+    std::cerr << "-> " << Result << "\n";
     if (Result && Result->Details) {
-      std::cout << "     *Error Details* " << Result->Details << " \n";
+      std::cerr << "     *Error Details* " << Result->Details << " \n";
     }
   }
   return Result;
@@ -413,7 +413,7 @@ ol_impl_result_t olGetDeviceInfoSize_val(ol_device_handle_t Device,
 OL_APIEXPORT ol_result_t OL_APICALL olGetDeviceInfoSize(
     ol_device_handle_t Device, ol_device_info_t PropName, size_t *PropSizeRet) {
   if (offloadConfig().TracingEnabled) {
-    std::cout << "---> olGetDeviceInfoSize";
+    std::cerr << "---> olGetDeviceInfoSize";
   }
 
   ol_result_t Result = olGetDeviceInfoSize_val(Device, PropName, PropSizeRet);
@@ -421,10 +421,10 @@ OL_APIEXPORT ol_result_t OL_APICALL olGetDeviceInfoSize(
   if (offloadConfig().TracingEnabled) {
     ol_get_device_info_size_params_t Params = {&Device, &PropName,
                                                &PropSizeRet};
-    std::cout << "(" << &Params << ")";
-    std::cout << "-> " << Result << "\n";
+    std::cerr << "(" << &Params << ")";
+    std::cerr << "-> " << Result << "\n";
     if (Result && Result->Details) {
-      std::cout << "     *Error Details* " << Result->Details << " \n";
+      std::cerr << "     *Error Details* " << Result->Details << " \n";
     }
   }
   return Result;
@@ -464,17 +464,17 @@ OL_APIEXPORT ol_result_t OL_APICALL olMemAlloc(ol_device_handle_t Device,
                                                size_t Size,
                                                void **AllocationOut) {
   if (offloadConfig().TracingEnabled) {
-    std::cout << "---> olMemAlloc";
+    std::cerr << "---> olMemAlloc";
   }
 
   ol_result_t Result = olMemAlloc_val(Device, Type, Size, AllocationOut);
 
   if (offloadConfig().TracingEnabled) {
     ol_mem_alloc_params_t Params = {&Device, &Type, &Size, &AllocationOut};
-    std::cout << "(" << &Params << ")";
-    std::cout << "-> " << Result << "\n";
+    std::cerr << "(" << &Params << ")";
+    std::cerr << "-> " << Result << "\n";
     if (Result && Result->Details) {
-      std::cout << "     *Error Details* " << Result->Details << " \n";
+      std::cerr << "     *Error Details* " << Result->Details << " \n";
     }
   }
   return Result;
@@ -509,17 +509,17 @@ OL_APIEXPORT ol_result_t OL_APICALL olMemFree(ol_device_handle_t Device,
                                               ol_alloc_type_t Type,
                                               void *Address) {
   if (offloadConfig().TracingEnabled) {
-    std::cout << "---> olMemFree";
+    std::cerr << "---> olMemFree";
   }
 
   ol_result_t Result = olMemFree_val(Device, Type, Address);
 
   if (offloadConfig().TracingEnabled) {
     ol_mem_free_params_t Params = {&Device, &Type, &Address};
-    std::cout << "(" << &Params << ")";
-    std::cout << "-> " << Result << "\n";
+    std::cerr << "(" << &Params << ")";
+    std::cerr << "-> " << Result << "\n";
     if (Result && Result->Details) {
-      std::cout << "     *Error Details* " << Result->Details << " \n";
+      std::cerr << "     *Error Details* " << Result->Details << " \n";
     }
   }
   return Result;
@@ -552,17 +552,17 @@ ol_impl_result_t olCreateQueue_val(ol_device_handle_t Device,
 OL_APIEXPORT ol_result_t OL_APICALL olCreateQueue(ol_device_handle_t Device,
                                                   ol_queue_handle_t *Queue) {
   if (offloadConfig().TracingEnabled) {
-    std::cout << "---> olCreateQueue";
+    std::cerr << "---> olCreateQueue";
   }
 
   ol_result_t Result = olCreateQueue_val(Device, Queue);
 
   if (offloadConfig().TracingEnabled) {
     ol_create_queue_params_t Params = {&Device, &Queue};
-    std::cout << "(" << &Params << ")";
-    std::cout << "-> " << Result << "\n";
+    std::cerr << "(" << &Params << ")";
+    std::cerr << "-> " << Result << "\n";
     if (Result && Result->Details) {
-      std::cout << "     *Error Details* " << Result->Details << " \n";
+      std::cerr << "     *Error Details* " << Result->Details << " \n";
     }
   }
   return Result;
@@ -589,17 +589,17 @@ ol_impl_result_t olRetainQueue_val(ol_queue_handle_t Queue) {
 }
 OL_APIEXPORT ol_result_t OL_APICALL olRetainQueue(ol_queue_handle_t Queue) {
   if (offloadConfig().TracingEnabled) {
-    std::cout << "---> olRetainQueue";
+    std::cerr << "---> olRetainQueue";
   }
 
   ol_result_t Result = olRetainQueue_val(Queue);
 
   if (offloadConfig().TracingEnabled) {
     ol_retain_queue_params_t Params = {&Queue};
-    std::cout << "(" << &Params << ")";
-    std::cout << "-> " << Result << "\n";
+    std::cerr << "(" << &Params << ")";
+    std::cerr << "-> " << Result << "\n";
     if (Result && Result->Details) {
-      std::cout << "     *Error Details* " << Result->Details << " \n";
+      std::cerr << "     *Error Details* " << Result->Details << " \n";
     }
   }
   return Result;
@@ -625,17 +625,17 @@ ol_impl_result_t olReleaseQueue_val(ol_queue_handle_t Queue) {
 }
 OL_APIEXPORT ol_result_t OL_APICALL olReleaseQueue(ol_queue_handle_t Queue) {
   if (offloadConfig().TracingEnabled) {
-    std::cout << "---> olReleaseQueue";
+    std::cerr << "---> olReleaseQueue";
   }
 
   ol_result_t Result = olReleaseQueue_val(Queue);
 
   if (offloadConfig().TracingEnabled) {
     ol_release_queue_params_t Params = {&Queue};
-    std::cout << "(" << &Params << ")";
-    std::cout << "-> " << Result << "\n";
+    std::cerr << "(" << &Params << ")";
+    std::cerr << "-> " << Result << "\n";
     if (Result && Result->Details) {
-      std::cout << "     *Error Details* " << Result->Details << " \n";
+      std::cerr << "     *Error Details* " << Result->Details << " \n";
     }
   }
   return Result;
@@ -661,17 +661,17 @@ ol_impl_result_t olFinishQueue_val(ol_queue_handle_t Queue) {
 }
 OL_APIEXPORT ol_result_t OL_APICALL olFinishQueue(ol_queue_handle_t Queue) {
   if (offloadConfig().TracingEnabled) {
-    std::cout << "---> olFinishQueue";
+    std::cerr << "---> olFinishQueue";
   }
 
   ol_result_t Result = olFinishQueue_val(Queue);
 
   if (offloadConfig().TracingEnabled) {
     ol_finish_queue_params_t Params = {&Queue};
-    std::cout << "(" << &Params << ")";
-    std::cout << "-> " << Result << "\n";
+    std::cerr << "(" << &Params << ")";
+    std::cerr << "-> " << Result << "\n";
     if (Result && Result->Details) {
-      std::cout << "     *Error Details* " << Result->Details << " \n";
+      std::cerr << "     *Error Details* " << Result->Details << " \n";
     }
   }
   return Result;
@@ -697,17 +697,17 @@ ol_impl_result_t olRetainEvent_val(ol_event_handle_t Event) {
 }
 OL_APIEXPORT ol_result_t OL_APICALL olRetainEvent(ol_event_handle_t Event) {
   if (offloadConfig().TracingEnabled) {
-    std::cout << "---> olRetainEvent";
+    std::cerr << "---> olRetainEvent";
   }
 
   ol_result_t Result = olRetainEvent_val(Event);
 
   if (offloadConfig().TracingEnabled) {
     ol_retain_event_params_t Params = {&Event};
-    std::cout << "(" << &Params << ")";
-    std::cout << "-> " << Result << "\n";
+    std::cerr << "(" << &Params << ")";
+    std::cerr << "-> " << Result << "\n";
     if (Result && Result->Details) {
-      std::cout << "     *Error Details* " << Result->Details << " \n";
+      std::cerr << "     *Error Details* " << Result->Details << " \n";
     }
   }
   return Result;
@@ -733,17 +733,17 @@ ol_impl_result_t olReleaseEvent_val(ol_event_handle_t Event) {
 }
 OL_APIEXPORT ol_result_t OL_APICALL olReleaseEvent(ol_event_handle_t Event) {
   if (offloadConfig().TracingEnabled) {
-    std::cout << "---> olReleaseEvent";
+    std::cerr << "---> olReleaseEvent";
   }
 
   ol_result_t Result = olReleaseEvent_val(Event);
 
   if (offloadConfig().TracingEnabled) {
     ol_release_event_params_t Params = {&Event};
-    std::cout << "(" << &Params << ")";
-    std::cout << "-> " << Result << "\n";
+    std::cerr << "(" << &Params << ")";
+    std::cerr << "-> " << Result << "\n";
     if (Result && Result->Details) {
-      std::cout << "     *Error Details* " << Result->Details << " \n";
+      std::cerr << "     *Error Details* " << Result->Details << " \n";
     }
   }
   return Result;
@@ -769,17 +769,17 @@ ol_impl_result_t olWaitEvent_val(ol_event_handle_t Event) {
 }
 OL_APIEXPORT ol_result_t OL_APICALL olWaitEvent(ol_event_handle_t Event) {
   if (offloadConfig().TracingEnabled) {
-    std::cout << "---> olWaitEvent";
+    std::cerr << "---> olWaitEvent";
   }
 
   ol_result_t Result = olWaitEvent_val(Event);
 
   if (offloadConfig().TracingEnabled) {
     ol_wait_event_params_t Params = {&Event};
-    std::cout << "(" << &Params << ")";
-    std::cout << "-> " << Result << "\n";
+    std::cerr << "(" << &Params << ")";
+    std::cerr << "-> " << Result << "\n";
     if (Result && Result->Details) {
-      std::cout << "     *Error Details* " << Result->Details << " \n";
+      std::cerr << "     *Error Details* " << Result->Details << " \n";
     }
   }
   return Result;
@@ -794,8 +794,8 @@ ol_result_t olWaitEventWithCodeLoc(ol_event_handle_t Event,
 }
 
 ///////////////////////////////////////////////////////////////////////////////
-ol_impl_result_t olEnqueueDataWrite_val(ol_queue_handle_t Queue, void *SrcPtr,
-                                        void *DstPtr, size_t Size,
+ol_impl_result_t olEnqueueDataWrite_val(ol_queue_handle_t Queue, void *DstPtr,
+                                        void *SrcPtr, size_t Size,
                                         ol_event_handle_t *EventOut) {
   if (true /*enableParameterValidation*/) {
     if (Size == 0) {
@@ -806,107 +806,106 @@ ol_impl_result_t olEnqueueDataWrite_val(ol_queue_handle_t Queue, void *SrcPtr,
       return OL_ERRC_INVALID_NULL_HANDLE;
     }
 
-    if (NULL == SrcPtr) {
+    if (NULL == DstPtr) {
       return OL_ERRC_INVALID_NULL_POINTER;
     }
 
-    if (NULL == DstPtr) {
+    if (NULL == SrcPtr) {
       return OL_ERRC_INVALID_NULL_POINTER;
     }
   }
 
-  return olEnqueueDataWrite_impl(Queue, SrcPtr, DstPtr, Size, EventOut);
+  return olEnqueueDataWrite_impl(Queue, DstPtr, SrcPtr, Size, EventOut);
 }
 OL_APIEXPORT ol_result_t OL_APICALL
-olEnqueueDataWrite(ol_queue_handle_t Queue, void *SrcPtr, void *DstPtr,
+olEnqueueDataWrite(ol_queue_handle_t Queue, void *DstPtr, void *SrcPtr,
                    size_t Size, ol_event_handle_t *EventOut) {
   if (offloadConfig().TracingEnabled) {
-    std::cout << "---> olEnqueueDataWrite";
+    std::cerr << "---> olEnqueueDataWrite";
   }
 
   ol_result_t Result =
-      olEnqueueDataWrite_val(Queue, SrcPtr, DstPtr, Size, EventOut);
+      olEnqueueDataWrite_val(Queue, DstPtr, SrcPtr, Size, EventOut);
 
   if (offloadConfig().TracingEnabled) {
-    ol_enqueue_data_write_params_t Params = {&Queue, &SrcPtr, &DstPtr, &Size,
+    ol_enqueue_data_write_params_t Params = {&Queue, &DstPtr, &SrcPtr, &Size,
                                              &EventOut};
-    std::cout << "(" << &Params << ")";
-    std::cout << "-> " << Result << "\n";
+    std::cerr << "(" << &Params << ")";
+    std::cerr << "-> " << Result << "\n";
     if (Result && Result->Details) {
-      std::cout << "     *Error Details* " << Result->Details << " \n";
+      std::cerr << "     *Error Details* " << Result->Details << " \n";
     }
   }
   return Result;
 }
-ol_result_t olEnqueueDataWriteWithCodeLoc(ol_queue_handle_t Queue, void *SrcPtr,
-                                          void *DstPtr, size_t Size,
+ol_result_t olEnqueueDataWriteWithCodeLoc(ol_queue_handle_t Queue, void *DstPtr,
+                                          void *SrcPtr, size_t Size,
                                           ol_event_handle_t *EventOut,
                                           ol_code_location_t *CodeLocation) {
   currentCodeLocation() = CodeLocation;
   ol_result_t Result =
-      olEnqueueDataWrite(Queue, SrcPtr, DstPtr, Size, EventOut);
+      olEnqueueDataWrite(Queue, DstPtr, SrcPtr, Size, EventOut);
 
   currentCodeLocation() = nullptr;
   return Result;
 }
 
 ///////////////////////////////////////////////////////////////////////////////
-ol_impl_result_t olEnqueueDataRead_val(ol_queue_handle_t Queue, void *SrcPtr,
-                                       void *DstPtr, size_t Size,
+ol_impl_result_t olEnqueueDataRead_val(ol_queue_handle_t Queue, void *DstPtr,
+                                       void *SrcPtr, size_t Size,
                                        ol_event_handle_t *EventOut) {
   if (true /*enableParameterValidation*/) {
     if (NULL == Queue) {
       return OL_ERRC_INVALID_NULL_HANDLE;
     }
 
-    if (NULL == SrcPtr) {
+    if (NULL == DstPtr) {
       return OL_ERRC_INVALID_NULL_POINTER;
     }
 
-    if (NULL == DstPtr) {
+    if (NULL == SrcPtr) {
       return OL_ERRC_INVALID_NULL_POINTER;
     }
   }
 
-  return olEnqueueDataRead_impl(Queue, SrcPtr, DstPtr, Size, EventOut);
+  return olEnqueueDataRead_impl(Queue, DstPtr, SrcPtr, Size, EventOut);
 }
 OL_APIEXPORT ol_result_t OL_APICALL
-olEnqueueDataRead(ol_queue_handle_t Queue, void *SrcPtr, void *DstPtr,
+olEnqueueDataRead(ol_queue_handle_t Queue, void *DstPtr, void *SrcPtr,
                   size_t Size, ol_event_handle_t *EventOut) {
   if (offloadConfig().TracingEnabled) {
-    std::cout << "---> olEnqueueDataRead";
+    std::cerr << "---> olEnqueueDataRead";
   }
 
   ol_result_t Result =
-      olEnqueueDataRead_val(Queue, SrcPtr, DstPtr, Size, EventOut);
+      olEnqueueDataRead_val(Queue, DstPtr, SrcPtr, Size, EventOut);
 
   if (offloadConfig().TracingEnabled) {
-    ol_enqueue_data_read_params_t Params = {&Queue, &SrcPtr, &DstPtr, &Size,
+    ol_enqueue_data_read_params_t Params = {&Queue, &DstPtr, &SrcPtr, &Size,
                                             &EventOut};
-    std::cout << "(" << &Params << ")";
-    std::cout << "-> " << Result << "\n";
+    std::cerr << "(" << &Params << ")";
+    std::cerr << "-> " << Result << "\n";
     if (Result && Result->Details) {
-      std::cout << "     *Error Details* " << Result->Details << " \n";
+      std::cerr << "     *Error Details* " << Result->Details << " \n";
     }
   }
   return Result;
 }
-ol_result_t olEnqueueDataReadWithCodeLoc(ol_queue_handle_t Queue, void *SrcPtr,
-                                         void *DstPtr, size_t Size,
+ol_result_t olEnqueueDataReadWithCodeLoc(ol_queue_handle_t Queue, void *DstPtr,
+                                         void *SrcPtr, size_t Size,
                                          ol_event_handle_t *EventOut,
                                          ol_code_location_t *CodeLocation) {
   currentCodeLocation() = CodeLocation;
-  ol_result_t Result = olEnqueueDataRead(Queue, SrcPtr, DstPtr, Size, EventOut);
+  ol_result_t Result = olEnqueueDataRead(Queue, DstPtr, SrcPtr, Size, EventOut);
 
   currentCodeLocation() = nullptr;
   return Result;
 }
 
 ///////////////////////////////////////////////////////////////////////////////
-ol_impl_result_t olEnqueueDataCopy_val(ol_queue_handle_t Queue, void *SrcPtr,
-                                       void *DstPtr,
+ol_impl_result_t olEnqueueDataCopy_val(ol_queue_handle_t Queue,
                                        ol_device_handle_t DstDevice,
-                                       size_t Size,
+                                       void *DstPtr, void *SrcPtr, size_t Size,
                                        ol_event_handle_t *EventOut) {
   if (true /*enableParameterValidation*/) {
     if (NULL == Queue) {
@@ -917,48 +916,48 @@ ol_impl_result_t olEnqueueDataCopy_val(ol_queue_handle_t Queue, void *SrcPtr,
       return OL_ERRC_INVALID_NULL_HANDLE;
     }
 
-    if (NULL == SrcPtr) {
+    if (NULL == DstPtr) {
       return OL_ERRC_INVALID_NULL_POINTER;
     }
 
-    if (NULL == DstPtr) {
+    if (NULL == SrcPtr) {
       return OL_ERRC_INVALID_NULL_POINTER;
     }
   }
 
-  return olEnqueueDataCopy_impl(Queue, SrcPtr, DstPtr, DstDevice, Size,
+  return olEnqueueDataCopy_impl(Queue, DstDevice, DstPtr, SrcPtr, Size,
                                 EventOut);
 }
 OL_APIEXPORT ol_result_t OL_APICALL olEnqueueDataCopy(
-    ol_queue_handle_t Queue, void *SrcPtr, void *DstPtr,
-    ol_device_handle_t DstDevice, size_t Size, ol_event_handle_t *EventOut) {
+    ol_queue_handle_t Queue, ol_device_handle_t DstDevice, void *DstPtr,
+    void *SrcPtr, size_t Size, ol_event_handle_t *EventOut) {
   if (offloadConfig().TracingEnabled) {
-    std::cout << "---> olEnqueueDataCopy";
+    std::cerr << "---> olEnqueueDataCopy";
   }
 
   ol_result_t Result =
-      olEnqueueDataCopy_val(Queue, SrcPtr, DstPtr, DstDevice, Size, EventOut);
+      olEnqueueDataCopy_val(Queue, DstDevice, DstPtr, SrcPtr, Size, EventOut);
 
   if (offloadConfig().TracingEnabled) {
-    ol_enqueue_data_copy_params_t Params = {&Queue,     &SrcPtr, &DstPtr,
-                                            &DstDevice, &Size,   &EventOut};
-    std::cout << "(" << &Params << ")";
-    std::cout << "-> " << Result << "\n";
+    ol_enqueue_data_copy_params_t Params = {&Queue,  &DstDevice, &DstPtr,
+                                            &SrcPtr, &Size,      &EventOut};
+    std::cerr << "(" << &Params << ")";
+    std::cerr << "-> " << Result << "\n";
     if (Result && Result->Details) {
-      std::cout << "     *Error Details* " << Result->Details << " \n";
+      std::cerr << "     *Error Details* " << Result->Details << " \n";
     }
   }
   return Result;
 }
-ol_result_t olEnqueueDataCopyWithCodeLoc(ol_queue_handle_t Queue, void *SrcPtr,
-                                         void *DstPtr,
+ol_result_t olEnqueueDataCopyWithCodeLoc(ol_queue_handle_t Queue,
                                          ol_device_handle_t DstDevice,
+                                         void *DstPtr, void *SrcPtr,
                                          size_t Size,
                                          ol_event_handle_t *EventOut,
                                          ol_code_location_t *CodeLocation) {
   currentCodeLocation() = CodeLocation;
   ol_result_t Result =
-      olEnqueueDataCopy(Queue, SrcPtr, DstPtr, DstDevice, Size, EventOut);
+      olEnqueueDataCopy(Queue, DstDevice, DstPtr, SrcPtr, Size, EventOut);
 
   currentCodeLocation() = nullptr;
   return Result;
@@ -990,7 +989,7 @@ olEnqueueKernelLaunch(ol_queue_handle_t Queue, ol_kernel_handle_t Kernel,
                       const ol_kernel_launch_size_args_t *LaunchSizeArgs,
                       ol_event_handle_t *EventOut) {
   if (offloadConfig().TracingEnabled) {
-    std::cout << "---> olEnqueueKernelLaunch";
+    std::cerr << "---> olEnqueueKernelLaunch";
   }
 
   ol_result_t Result =
@@ -999,10 +998,10 @@ olEnqueueKernelLaunch(ol_queue_handle_t Queue, ol_kernel_handle_t Kernel,
   if (offloadConfig().TracingEnabled) {
     ol_enqueue_kernel_launch_params_t Params = {&Queue, &Kernel,
                                                 &LaunchSizeArgs, &EventOut};
-    std::cout << "(" << &Params << ")";
-    std::cout << "-> " << Result << "\n";
+    std::cerr << "(" << &Params << ")";
+    std::cerr << "-> " << Result << "\n";
     if (Result && Result->Details) {
-      std::cout << "     *Error Details* " << Result->Details << " \n";
+      std::cerr << "     *Error Details* " << Result->Details << " \n";
     }
   }
   return Result;
@@ -1022,7 +1021,7 @@ ol_result_t olEnqueueKernelLaunchWithCodeLoc(
 ///////////////////////////////////////////////////////////////////////////////
 ol_impl_result_t olCreateProgram_val(ol_device_handle_t Device, void *ProgData,
                                      size_t ProgDataSize,
-                                     ol_program_handle_t *Queue) {
+                                     ol_program_handle_t *Program) {
   if (true /*enableParameterValidation*/) {
     if (NULL == Device) {
       return OL_ERRC_INVALID_NULL_HANDLE;
@@ -1032,40 +1031,40 @@ ol_impl_result_t olCreateProgram_val(ol_device_handle_t Device, void *ProgData,
       return OL_ERRC_INVALID_NULL_POINTER;
     }
 
-    if (NULL == Queue) {
+    if (NULL == Program) {
       return OL_ERRC_INVALID_NULL_POINTER;
     }
   }
 
-  return olCreateProgram_impl(Device, ProgData, ProgDataSize, Queue);
+  return olCreateProgram_impl(Device, ProgData, ProgDataSize, Program);
 }
 OL_APIEXPORT ol_result_t OL_APICALL
 olCreateProgram(ol_device_handle_t Device, void *ProgData, size_t ProgDataSize,
-                ol_program_handle_t *Queue) {
+                ol_program_handle_t *Program) {
   if (offloadConfig().TracingEnabled) {
-    std::cout << "---> olCreateProgram";
+    std::cerr << "---> olCreateProgram";
   }
 
   ol_result_t Result =
-      olCreateProgram_val(Device, ProgData, ProgDataSize, Queue);
+      olCreateProgram_val(Device, ProgData, ProgDataSize, Program);
 
   if (offloadConfig().TracingEnabled) {
     ol_create_program_params_t Params = {&Device, &ProgData, &ProgDataSize,
-                                         &Queue};
-    std::cout << "(" << &Params << ")";
-    std::cout << "-> " << Result << "\n";
+                                         &Program};
+    std::cerr << "(" << &Params << ")";
+    std::cerr << "-> " << Result << "\n";
     if (Result && Result->Details) {
-      std::cout << "     *Error Details* " << Result->Details << " \n";
+      std::cerr << "     *Error Details* " << Result->Details << " \n";
     }
   }
   return Result;
 }
 ol_result_t olCreateProgramWithCodeLoc(ol_device_handle_t Device,
                                        void *ProgData, size_t ProgDataSize,
-                                       ol_program_handle_t *Queue,
+                                       ol_program_handle_t *Program,
                                        ol_code_location_t *CodeLocation) {
   currentCodeLocation() = CodeLocation;
-  ol_result_t Result = olCreateProgram(Device, ProgData, ProgDataSize, Queue);
+  ol_result_t Result = olCreateProgram(Device, ProgData, ProgDataSize, Program);
 
   currentCodeLocation() = nullptr;
   return Result;
@@ -1084,17 +1083,17 @@ ol_impl_result_t olRetainProgram_val(ol_program_handle_t Program) {
 OL_APIEXPORT ol_result_t OL_APICALL
 olRetainProgram(ol_program_handle_t Program) {
   if (offloadConfig().TracingEnabled) {
-    std::cout << "---> olRetainProgram";
+    std::cerr << "---> olRetainProgram";
   }
 
   ol_result_t Result = olRetainProgram_val(Program);
 
   if (offloadConfig().TracingEnabled) {
     ol_retain_program_params_t Params = {&Program};
-    std::cout << "(" << &Params << ")";
-    std::cout << "-> " << Result << "\n";
+    std::cerr << "(" << &Params << ")";
+    std::cerr << "-> " << Result << "\n";
     if (Result && Result->Details) {
-      std::cout << "     *Error Details* " << Result->Details << " \n";
+      std::cerr << "     *Error Details* " << Result->Details << " \n";
     }
   }
   return Result;
@@ -1121,17 +1120,17 @@ ol_impl_result_t olReleaseProgram_val(ol_program_handle_t Program) {
 OL_APIEXPORT ol_result_t OL_APICALL
 olReleaseProgram(ol_program_handle_t Program) {
   if (offloadConfig().TracingEnabled) {
-    std::cout << "---> olReleaseProgram";
+    std::cerr << "---> olReleaseProgram";
   }
 
   ol_result_t Result = olReleaseProgram_val(Program);
 
   if (offloadConfig().TracingEnabled) {
     ol_release_program_params_t Params = {&Program};
-    std::cout << "(" << &Params << ")";
-    std::cout << "-> " << Result << "\n";
+    std::cerr << "(" << &Params << ")";
+    std::cerr << "-> " << Result << "\n";
     if (Result && Result->Details) {
-      std::cout << "     *Error Details* " << Result->Details << " \n";
+      std::cerr << "     *Error Details* " << Result->Details << " \n";
     }
   }
   return Result;
@@ -1169,17 +1168,17 @@ OL_APIEXPORT ol_result_t OL_APICALL olCreateKernel(ol_program_handle_t Program,
                                                    const char *KernelName,
                                                    ol_kernel_handle_t *Kernel) {
   if (offloadConfig().TracingEnabled) {
-    std::cout << "---> olCreateKernel";
+    std::cerr << "---> olCreateKernel";
   }
 
   ol_result_t Result = olCreateKernel_val(Program, KernelName, Kernel);
 
   if (offloadConfig().TracingEnabled) {
     ol_create_kernel_params_t Params = {&Program, &KernelName, &Kernel};
-    std::cout << "(" << &Params << ")";
-    std::cout << "-> " << Result << "\n";
+    std::cerr << "(" << &Params << ")";
+    std::cerr << "-> " << Result << "\n";
     if (Result && Result->Details) {
-      std::cout << "     *Error Details* " << Result->Details << " \n";
+      std::cerr << "     *Error Details* " << Result->Details << " \n";
     }
   }
   return Result;
@@ -1207,17 +1206,17 @@ ol_impl_result_t olRetainKernel_val(ol_kernel_handle_t Kernel) {
 }
 OL_APIEXPORT ol_result_t OL_APICALL olRetainKernel(ol_kernel_handle_t Kernel) {
   if (offloadConfig().TracingEnabled) {
-    std::cout << "---> olRetainKernel";
+    std::cerr << "---> olRetainKernel";
   }
 
   ol_result_t Result = olRetainKernel_val(Kernel);
 
   if (offloadConfig().TracingEnabled) {
     ol_retain_kernel_params_t Params = {&Kernel};
-    std::cout << "(" << &Params << ")";
-    std::cout << "-> " << Result << "\n";
+    std::cerr << "(" << &Params << ")";
+    std::cerr << "-> " << Result << "\n";
     if (Result && Result->Details) {
-      std::cout << "     *Error Details* " << Result->Details << " \n";
+      std::cerr << "     *Error Details* " << Result->Details << " \n";
     }
   }
   return Result;
@@ -1243,17 +1242,17 @@ ol_impl_result_t olReleaseKernel_val(ol_kernel_handle_t Kernel) {
 }
 OL_APIEXPORT ol_result_t OL_APICALL olReleaseKernel(ol_kernel_handle_t Kernel) {
   if (offloadConfig().TracingEnabled) {
-    std::cout << "---> olReleaseKernel";
+    std::cerr << "---> olReleaseKernel";
   }
 
   ol_result_t Result = olReleaseKernel_val(Kernel);
 
   if (offloadConfig().TracingEnabled) {
     ol_release_kernel_params_t Params = {&Kernel};
-    std::cout << "(" << &Params << ")";
-    std::cout << "-> " << Result << "\n";
+    std::cerr << "(" << &Params << ")";
+    std::cerr << "-> " << Result << "\n";
     if (Result && Result->Details) {
-      std::cout << "     *Error Details* " << Result->Details << " \n";
+      std::cerr << "     *Error Details* " << Result->Details << " \n";
     }
   }
   return Result;
@@ -1286,7 +1285,7 @@ ol_impl_result_t olSetKernelArgValue_val(ol_kernel_handle_t Kernel,
 OL_APIEXPORT ol_result_t OL_APICALL olSetKernelArgValue(
     ol_kernel_handle_t Kernel, uint32_t Index, size_t Size, void *ArgData) {
   if (offloadConfig().TracingEnabled) {
-    std::cout << "---> olSetKernelArgValue";
+    std::cerr << "---> olSetKernelArgValue";
   }
 
   ol_result_t Result = olSetKernelArgValue_val(Kernel, Index, Size, ArgData);
@@ -1294,10 +1293,10 @@ OL_APIEXPORT ol_result_t OL_APICALL olSetKernelArgValue(
   if (offloadConfig().TracingEnabled) {
     ol_set_kernel_arg_value_params_t Params = {&Kernel, &Index, &Size,
                                                &ArgData};
-    std::cout << "(" << &Params << ")";
-    std::cout << "-> " << Result << "\n";
+    std::cerr << "(" << &Params << ")";
+    std::cerr << "-> " << Result << "\n";
     if (Result && Result->Details) {
-      std::cout << "     *Error Details* " << Result->Details << " \n";
+      std::cerr << "     *Error Details* " << Result->Details << " \n";
     }
   }
   return Result;
@@ -1331,7 +1330,7 @@ ol_impl_result_t olSetKernelArgsData_val(ol_kernel_handle_t Kernel,
 OL_APIEXPORT ol_result_t OL_APICALL olSetKernelArgsData(
     ol_kernel_handle_t Kernel, void *ArgsData, size_t ArgsDataSize) {
   if (offloadConfig().TracingEnabled) {
-    std::cout << "---> olSetKernelArgsData";
+    std::cerr << "---> olSetKernelArgsData";
   }
 
   ol_result_t Result = olSetKernelArgsData_val(Kernel, ArgsData, ArgsDataSize);
@@ -1339,10 +1338,10 @@ OL_APIEXPORT ol_result_t OL_APICALL olSetKernelArgsData(
   if (offloadConfig().TracingEnabled) {
     ol_set_kernel_args_data_params_t Params = {&Kernel, &ArgsData,
                                                &ArgsDataSize};
-    std::cout << "(" << &Params << ")";
-    std::cout << "-> " << Result << "\n";
+    std::cerr << "(" << &Params << ")";
+    std::cerr << "-> " << Result << "\n";
     if (Result && Result->Details) {
-      std::cout << "     *Error Details* " << Result->Details << " \n";
+      std::cerr << "     *Error Details* " << Result->Details << " \n";
     }
   }
   return Result;
diff --git a/offload/liboffload/include/generated/OffloadImplFuncDecls.inc b/offload/liboffload/include/generated/OffloadImplFuncDecls.inc
index e7179e44fc9ec..976422f3d7fd5 100644
--- a/offload/liboffload/include/generated/OffloadImplFuncDecls.inc
+++ b/offload/liboffload/include/generated/OffloadImplFuncDecls.inc
@@ -59,18 +59,17 @@ ol_impl_result_t olReleaseEvent_impl(ol_event_handle_t Event);
 
 ol_impl_result_t olWaitEvent_impl(ol_event_handle_t Event);
 
-ol_impl_result_t olEnqueueDataWrite_impl(ol_queue_handle_t Queue, void *SrcPtr,
-                                         void *DstPtr, size_t Size,
+ol_impl_result_t olEnqueueDataWrite_impl(ol_queue_handle_t Queue, void *DstPtr,
+                                         void *SrcPtr, size_t Size,
                                          ol_event_handle_t *EventOut);
 
-ol_impl_result_t olEnqueueDataRead_impl(ol_queue_handle_t Queue, void *SrcPtr,
-                                        void *DstPtr, size_t Size,
+ol_impl_result_t olEnqueueDataRead_impl(ol_queue_handle_t Queue, void *DstPtr,
+                                        void *SrcPtr, size_t Size,
                                         ol_event_handle_t *EventOut);
 
-ol_impl_result_t olEnqueueDataCopy_impl(ol_queue_handle_t Queue, void *SrcPtr,
-                                        void *DstPtr,
+ol_impl_result_t olEnqueueDataCopy_impl(ol_queue_handle_t Queue,
                                         ol_device_handle_t DstDevice,
-                                        size_t Size,
+                                        void *DstPtr, void *SrcPtr, size_t Size,
                                         ol_event_handle_t *EventOut);
 
 ol_impl_result_t
@@ -80,7 +79,7 @@ olEnqueueKernelLaunch_impl(ol_queue_handle_t Queue, ol_kernel_handle_t Kernel,
 
 ol_impl_result_t olCreateProgram_impl(ol_device_handle_t Device, void *ProgData,
                                       size_t ProgDataSize,
-                                      ol_program_handle_t *Queue);
+                                      ol_program_handle_t *Program);
 
 ol_impl_result_t olRetainProgram_impl(ol_program_handle_t Program);
 
diff --git a/offload/liboffload/include/generated/OffloadPrint.hpp b/offload/liboffload/include/generated/OffloadPrint.hpp
index 56ab655a4ae74..10dc58d1079a0 100644
--- a/offload/liboffload/include/generated/OffloadPrint.hpp
+++ b/offload/liboffload/include/generated/OffloadPrint.hpp
@@ -562,12 +562,12 @@ operator<<(std::ostream &os,
   os << ".Queue = ";
   printPtr(os, *params->pQueue);
   os << ", ";
-  os << ".SrcPtr = ";
-  printPtr(os, *params->pSrcPtr);
-  os << ", ";
   os << ".DstPtr = ";
   printPtr(os, *params->pDstPtr);
   os << ", ";
+  os << ".SrcPtr = ";
+  printPtr(os, *params->pSrcPtr);
+  os << ", ";
   os << ".Size = ";
   os << *params->pSize;
   os << ", ";
@@ -582,12 +582,12 @@ operator<<(std::ostream &os,
   os << ".Queue = ";
   printPtr(os, *params->pQueue);
   os << ", ";
-  os << ".SrcPtr = ";
-  printPtr(os, *params->pSrcPtr);
-  os << ", ";
   os << ".DstPtr = ";
   printPtr(os, *params->pDstPtr);
   os << ", ";
+  os << ".SrcPtr = ";
+  printPtr(os, *params->pSrcPtr);
+  os << ", ";
   os << ".Size = ";
   os << *params->pSize;
   os << ", ";
@@ -602,14 +602,14 @@ operator<<(std::ostream &os,
   os << ".Queue = ";
   printPtr(os, *params->pQueue);
   os << ", ";
-  os << ".SrcPtr = ";
-  printPtr(os, *params->pSrcPtr);
+  os << ".DstDevice = ";
+  printPtr(os, *params->pDstDevice);
   os << ", ";
   os << ".DstPtr = ";
   printPtr(os, *params->pDstPtr);
   os << ", ";
-  os << ".DstDevice = ";
-  printPtr(os, *params->pDstDevice);
+  os << ".SrcPtr = ";
+  printPtr(os, *params->pSrcPtr);
   os << ", ";
   os << ".Size = ";
   os << *params->pSize;
@@ -647,8 +647,8 @@ operator<<(std::ostream &os, const struct ol_create_program_params_t *params) {
   os << ".ProgDataSize = ";
   os << *params->pProgDataSize;
   os << ", ";
-  os << ".Queue = ";
-  printPtr(os, *params->pQueue);
+  os << ".Program = ";
+  printPtr(os, *params->pProgram);
   return os;
 }
 
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index 89cc42823261f..f6c3230558092 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -425,8 +425,8 @@ ol_event_handle_t makeEvent(ol_queue_handle_t Queue) {
   return EventImpl.release();
 }
 
-ol_impl_result_t olEnqueueDataWrite_impl(ol_queue_handle_t Queue, void *SrcPtr,
-                                         void *DstPtr, size_t Size,
+ol_impl_result_t olEnqueueDataWrite_impl(ol_queue_handle_t Queue, void *DstPtr,
+                                         void *SrcPtr, size_t Size,
                                          ol_event_handle_t *EventOut) {
   auto &DeviceImpl = Queue->Device->Device;
 
@@ -441,8 +441,8 @@ ol_impl_result_t olEnqueueDataWrite_impl(ol_queue_handle_t Queue, void *SrcPtr,
   return OL_SUCCESS;
 }
 
-ol_impl_result_t olEnqueueDataRead_impl(ol_queue_handle_t Queue, void *SrcPtr,
-                                        void *DstPtr, size_t Size,
+ol_impl_result_t olEnqueueDataRead_impl(ol_queue_handle_t Queue, void *DstPtr,
+                                        void *SrcPtr, size_t Size,
                                         ol_event_handle_t *EventOut) {
   auto &DeviceImpl = Queue->Device->Device;
 
@@ -457,10 +457,9 @@ ol_impl_result_t olEnqueueDataRead_impl(ol_queue_handle_t Queue, void *SrcPtr,
   return OL_SUCCESS;
 }
 
-ol_impl_result_t olEnqueueDataCopy_impl(ol_queue_handle_t Queue, void *SrcPtr,
-                                        void *DstPtr,
+ol_impl_result_t olEnqueueDataCopy_impl(ol_queue_handle_t Queue,
                                         ol_device_handle_t DstDevice,
-                                        size_t Size,
+                                        void *DstPtr, void *SrcPtr, size_t Size,
                                         ol_event_handle_t *EventOut) {
   auto &DeviceImpl = Queue->Device->Device;
 
diff --git a/offload/tools/offload-tblgen/EntryPointGen.cpp b/offload/tools/offload-tblgen/EntryPointGen.cpp
index 990ff96a3121d..36fc5c3eb1c2a 100644
--- a/offload/tools/offload-tblgen/EntryPointGen.cpp
+++ b/offload/tools/offload-tblgen/EntryPointGen.cpp
@@ -72,7 +72,7 @@ static void EmitEntryPointFunc(const FunctionRec &F, raw_ostream &OS) {
 
   // Emit pre-call prints
   OS << TAB_1 "if (offloadConfig().TracingEnabled) {\n";
-  OS << formatv(TAB_2 "std::cout << \"---> {0}\";\n", F.getName());
+  OS << formatv(TAB_2 "std::cerr << \"---> {0}\";\n", F.getName());
   OS << TAB_1 "}\n\n";
 
   // Perform actual function call to the validation wrapper
@@ -91,13 +91,13 @@ static void EmitEntryPointFunc(const FunctionRec &F, raw_ostream &OS) {
       }
     }
     OS << formatv("};\n");
-    OS << TAB_2 "std::cout << \"(\" << &Params << \")\";\n";
+    OS << TAB_2 "std::cerr << \"(\" << &Params << \")\";\n";
   } else {
-    OS << TAB_2 "std::cout << \"()\";\n";
+    OS << TAB_2 "std::cerr << \"()\";\n";
   }
-  OS << TAB_2 "std::cout << \"-> \" << Result << \"\\n\";\n";
+  OS << TAB_2 "std::cerr << \"-> \" << Result << \"\\n\";\n";
   OS << TAB_2 "if (Result && Result->Details) {\n";
-  OS << TAB_3 "std::cout << \"     *Error Details* \" << Result->Details "
+  OS << TAB_3 "std::cerr << \"     *Error Details* \" << Result->Details "
               "<< \" \\n\";\n";
   OS << TAB_2 "}\n";
   OS << TAB_1 "}\n";
diff --git a/offload/unittests/OffloadAPI/enqueue/olEnqueueDataCopy.cpp b/offload/unittests/OffloadAPI/enqueue/olEnqueueDataCopy.cpp
index d15e738bc94e6..dc2791266fa14 100644
--- a/offload/unittests/OffloadAPI/enqueue/olEnqueueDataCopy.cpp
+++ b/offload/unittests/OffloadAPI/enqueue/olEnqueueDataCopy.cpp
@@ -22,11 +22,11 @@ TEST_F(olEnqueueDataCopyTest, Success) {
   ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, Size, &AllocA));
   ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, Size, &AllocB));
   ASSERT_SUCCESS(
-      olEnqueueDataWrite(Queue, Input.data(), AllocA, Size, nullptr));
+      olEnqueueDataWrite(Queue, AllocA, Input.data(), Size, nullptr));
   ASSERT_SUCCESS(
-      olEnqueueDataCopy(Queue, AllocA, AllocB, Device, Size, nullptr));
+      olEnqueueDataCopy(Queue, Device, AllocB, AllocA, Size, nullptr));
   ASSERT_SUCCESS(
-      olEnqueueDataRead(Queue, AllocB, Output.data(), Size, nullptr));
+      olEnqueueDataRead(Queue, Output.data(), AllocB, Size, nullptr));
   ASSERT_SUCCESS(olFinishQueue(Queue));
   for (uint8_t Val : Output) {
     ASSERT_EQ(Val, 42);
diff --git a/offload/unittests/OffloadAPI/enqueue/olEnqueueDataRead.cpp b/offload/unittests/OffloadAPI/enqueue/olEnqueueDataRead.cpp
index 5787889c4febb..71323e4b44817 100644
--- a/offload/unittests/OffloadAPI/enqueue/olEnqueueDataRead.cpp
+++ b/offload/unittests/OffloadAPI/enqueue/olEnqueueDataRead.cpp
@@ -19,8 +19,8 @@ TEST_F(olEnqueueDataReadTest, Success) {
   std::vector<uint8_t> Output(Size, 0);
 
   ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, Size, &Alloc));
-  ASSERT_SUCCESS(olEnqueueDataWrite(Queue, Input.data(), Alloc, Size, nullptr));
-  ASSERT_SUCCESS(olEnqueueDataRead(Queue, Alloc, Output.data(), Size, nullptr));
+  ASSERT_SUCCESS(olEnqueueDataWrite(Queue, Alloc, Input.data(), Size, nullptr));
+  ASSERT_SUCCESS(olEnqueueDataRead(Queue, Output.data(), Alloc, Size, nullptr));
   ASSERT_SUCCESS(olFinishQueue(Queue));
   for (uint8_t Val : Output) {
     ASSERT_EQ(Val, 42);
diff --git a/offload/unittests/OffloadAPI/enqueue/olEnqueueDataWrite.cpp b/offload/unittests/OffloadAPI/enqueue/olEnqueueDataWrite.cpp
index d3f3edf58a531..f60d501a9918d 100644
--- a/offload/unittests/OffloadAPI/enqueue/olEnqueueDataWrite.cpp
+++ b/offload/unittests/OffloadAPI/enqueue/olEnqueueDataWrite.cpp
@@ -17,7 +17,7 @@ TEST_F(olEnqueueDataWriteTest, Success) {
   void *Alloc;
   ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, Size, &Alloc));
   std::vector<uint8_t> Input(Size, 42);
-  ASSERT_SUCCESS(olEnqueueDataWrite(Queue, Input.data(), Alloc, Size, nullptr));
+  ASSERT_SUCCESS(olEnqueueDataWrite(Queue, Alloc, Input.data(), Size, nullptr));
   olFinishQueue(Queue);
   olMemFree(Device, OL_ALLOC_TYPE_DEVICE, Alloc);
 }

>From 3fbdf61727c9e803134b54863af8dca009bde5b8 Mon Sep 17 00:00:00 2001
From: Callum Fare <callum at codeplay.com>
Date: Tue, 11 Feb 2025 17:17:33 +0000
Subject: [PATCH 15/17] Formatting

---
 offload/liboffload/src/OffloadImpl.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index f6c3230558092..d395eb34a5f4a 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -130,7 +130,7 @@ void initPlugins() {
   // Attempt to create an instance of each supported plugin.
 #define PLUGIN_TARGET(Name)                                                    \
   do {                                                                         \
-    Platforms().emplace_back(ol_platform_impl_t{                            \
+    Platforms().emplace_back(ol_platform_impl_t{                               \
         std::unique_ptr<GenericPluginTy>(createPlugin_##Name()), {}});         \
   } while (false);
 #include "Shared/Targets.def"

>From 0ca7527f782c53f7f33cc9fda401fabea0105a0a Mon Sep 17 00:00:00 2001
From: Callum Fare <callum at codeplay.com>
Date: Thu, 13 Feb 2025 16:47:19 +0000
Subject: [PATCH 16/17] Alternative memcpy implementation

---
 offload/liboffload/API/Device.td              |  12 +
 offload/liboffload/API/Enqueue.td             |  27 ++-
 .../liboffload/include/generated/OffloadAPI.h | 132 +++++++++--
 .../include/generated/OffloadEntryPoints.inc  | 207 ++++++++++++++----
 .../include/generated/OffloadFuncs.inc        |  16 +-
 .../generated/OffloadImplFuncDecls.inc        |  33 ++-
 .../include/generated/OffloadPrint.hpp        |  38 +++-
 offload/liboffload/src/OffloadImpl.cpp        | 114 +++++++---
 offload/unittests/OffloadAPI/CMakeLists.txt   |   1 +
 .../OffloadAPI/enqueue/olEnqueueDataCopy.cpp  |  12 +-
 .../OffloadAPI/enqueue/olEnqueueDataRead.cpp  |  12 +-
 .../OffloadAPI/enqueue/olEnqueueDataWrite.cpp |   9 +-
 .../OffloadAPI/enqueue/olEnqueueMemcpy.cpp    |  71 ++++++
 13 files changed, 542 insertions(+), 142 deletions(-)
 create mode 100644 offload/unittests/OffloadAPI/enqueue/olEnqueueMemcpy.cpp

diff --git a/offload/liboffload/API/Device.td b/offload/liboffload/API/Device.td
index 30c0b71fe7b37..ee1ca38b48a07 100644
--- a/offload/liboffload/API/Device.td
+++ b/offload/liboffload/API/Device.td
@@ -104,3 +104,15 @@ def : Function {
     Return<"OL_ERRC_INVALID_DEVICE">
   ];
 }
+
+def : Function {
+  let name = "olGetHostDevice";
+  let desc = "Return the special host device used to represent the host in memory transfer operations";
+  let details = [
+    "The host device does not support queues"
+  ];
+  let params = [
+    Param<"ol_device_handle_t*", "Device", "Output pointer for the device">
+  ]; // TODO: Take a platform?
+  let returns = [];
+}
diff --git a/offload/liboffload/API/Enqueue.td b/offload/liboffload/API/Enqueue.td
index 695b157ac1de3..723d87cfb2d29 100644
--- a/offload/liboffload/API/Enqueue.td
+++ b/offload/liboffload/API/Enqueue.td
@@ -11,7 +11,28 @@
 //===----------------------------------------------------------------------===//
 
 def : Function {
-    let name = "olEnqueueDataWrite";
+    let name = "olEnqueueMemcpy";
+    let desc = "Enqueue a memcpy operation.";
+    let details = [
+        "For host pointers, use the device returned by olGetHostDevice",
+        "At least one device must be a non-host device"
+    ];
+    let params = [
+        Param<"ol_queue_handle_t", "Queue", "handle of the queue", PARAM_IN>,
+        Param<"void*", "DstPtr", "pointer to copy to", PARAM_IN>,
+        Param<"ol_device_handle_t", "DstDevice", "device that DstPtr belongs to", PARAM_IN>,
+        Param<"void*", "SrcPtr", "pointer to copy from", PARAM_IN>,
+        Param<"ol_device_handle_t", "SrcDevice", "device that SrcPtr belongs to", PARAM_IN>,
+        Param<"size_t", "Size", "size in bytes of data to copy", PARAM_IN>,
+        Param<"ol_event_handle_t*", "EventOut", "optional recorded event for the enqueued operation", PARAM_OUT_OPTIONAL>
+    ];
+    let returns = [
+        Return<"OL_ERRC_INVALID_SIZE", ["`Size == 0`"]>
+    ];
+}
+
+def : Function {
+    let name = "olEnqueueMemcpyHtoD";
     let desc = "Enqueue a write operation from host to device memory";
     let details = [];
     let params = [
@@ -27,7 +48,7 @@ def : Function {
 }
 
 def : Function {
-    let name = "olEnqueueDataRead";
+    let name = "olEnqueueMemcpyDtoH";
     let desc = "Enqueue a read operation from device to host memory";
     let details = [];
     let params = [
@@ -41,7 +62,7 @@ def : Function {
 }
 
 def : Function {
-    let name = "olEnqueueDataCopy";
+    let name = "olEnqueueMemcpyDtoD";
     let desc = "Enqueue a write operation between device allocations";
     let details = [];
     let params = [
diff --git a/offload/liboffload/include/generated/OffloadAPI.h b/offload/liboffload/include/generated/OffloadAPI.h
index 110d252fe45a7..dd301f564a283 100644
--- a/offload/liboffload/include/generated/OffloadAPI.h
+++ b/offload/liboffload/include/generated/OffloadAPI.h
@@ -474,6 +474,24 @@ OL_APIEXPORT ol_result_t OL_APICALL olGetDeviceInfoSize(
     // [out] pointer to the number of bytes required to store the query
     size_t *PropSizeRet);
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Return the special host device used to represent the host in memory
+/// transfer operations
+///
+/// @details
+///    - The host device does not support queues
+///
+/// @returns
+///     - ::OL_RESULT_SUCCESS
+///     - ::OL_ERRC_UNINITIALIZED
+///     - ::OL_ERRC_DEVICE_LOST
+///     - ::OL_ERRC_INVALID_NULL_HANDLE
+///     - ::OL_ERRC_INVALID_NULL_POINTER
+///         + `NULL == Device`
+OL_APIEXPORT ol_result_t OL_APICALL olGetHostDevice(
+    //  Output pointer for the device
+    ol_device_handle_t *Device);
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Represents the type of allocation made with olMemAlloc
 typedef enum ol_alloc_type_t {
@@ -653,6 +671,42 @@ OL_APIEXPORT ol_result_t OL_APICALL olWaitEvent(
     // [in] handle of the event
     ol_event_handle_t Event);
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Enqueue a memcpy operation.
+///
+/// @details
+///    - For host pointers, use the device returned by olGetHostDevice
+///    - At least one device must be a non-host device
+///
+/// @returns
+///     - ::OL_RESULT_SUCCESS
+///     - ::OL_ERRC_UNINITIALIZED
+///     - ::OL_ERRC_DEVICE_LOST
+///     - ::OL_ERRC_INVALID_SIZE
+///         + `Size == 0`
+///     - ::OL_ERRC_INVALID_NULL_HANDLE
+///         + `NULL == Queue`
+///         + `NULL == DstDevice`
+///         + `NULL == SrcDevice`
+///     - ::OL_ERRC_INVALID_NULL_POINTER
+///         + `NULL == DstPtr`
+///         + `NULL == SrcPtr`
+OL_APIEXPORT ol_result_t OL_APICALL olEnqueueMemcpy(
+    // [in] handle of the queue
+    ol_queue_handle_t Queue,
+    // [in] pointer to copy to
+    void *DstPtr,
+    // [in] device that DstPtr belongs to
+    ol_device_handle_t DstDevice,
+    // [in] pointer to copy from
+    void *SrcPtr,
+    // [in] device that SrcPtr belongs to
+    ol_device_handle_t SrcDevice,
+    // [in] size in bytes of data to copy
+    size_t Size,
+    // [out][optional] optional recorded event for the enqueued operation
+    ol_event_handle_t *EventOut);
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Enqueue a write operation from host to device memory
 ///
@@ -669,7 +723,7 @@ OL_APIEXPORT ol_result_t OL_APICALL olWaitEvent(
 ///     - ::OL_ERRC_INVALID_NULL_POINTER
 ///         + `NULL == DstPtr`
 ///         + `NULL == SrcPtr`
-OL_APIEXPORT ol_result_t OL_APICALL olEnqueueDataWrite(
+OL_APIEXPORT ol_result_t OL_APICALL olEnqueueMemcpyHtoD(
     // [in] handle of the queue
     ol_queue_handle_t Queue,
     // [in] device pointer to copy to
@@ -695,7 +749,7 @@ OL_APIEXPORT ol_result_t OL_APICALL olEnqueueDataWrite(
 ///     - ::OL_ERRC_INVALID_NULL_POINTER
 ///         + `NULL == DstPtr`
 ///         + `NULL == SrcPtr`
-OL_APIEXPORT ol_result_t OL_APICALL olEnqueueDataRead(
+OL_APIEXPORT ol_result_t OL_APICALL olEnqueueMemcpyDtoH(
     // [in] handle of the queue
     ol_queue_handle_t Queue,
     // [in] host pointer to copy to
@@ -722,7 +776,7 @@ OL_APIEXPORT ol_result_t OL_APICALL olEnqueueDataRead(
 ///     - ::OL_ERRC_INVALID_NULL_POINTER
 ///         + `NULL == DstPtr`
 ///         + `NULL == SrcPtr`
-OL_APIEXPORT ol_result_t OL_APICALL olEnqueueDataCopy(
+OL_APIEXPORT ol_result_t OL_APICALL olEnqueueMemcpyDtoD(
     // [in] handle of the queue
     ol_queue_handle_t Queue,
     // [in] device that the destination pointer is resident on
@@ -1008,6 +1062,13 @@ typedef struct ol_get_device_info_size_params_t {
   size_t **pPropSizeRet;
 } ol_get_device_info_size_params_t;
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Function parameters for olGetHostDevice
+/// @details Each entry is a pointer to the parameter passed to the function;
+typedef struct ol_get_host_device_params_t {
+  ol_device_handle_t **pDevice;
+} ol_get_host_device_params_t;
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Function parameters for olMemAlloc
 /// @details Each entry is a pointer to the parameter passed to the function;
@@ -1078,38 +1139,51 @@ typedef struct ol_wait_event_params_t {
 } ol_wait_event_params_t;
 
 ///////////////////////////////////////////////////////////////////////////////
-/// @brief Function parameters for olEnqueueDataWrite
+/// @brief Function parameters for olEnqueueMemcpy
+/// @details Each entry is a pointer to the parameter passed to the function;
+typedef struct ol_enqueue_memcpy_params_t {
+  ol_queue_handle_t *pQueue;
+  void **pDstPtr;
+  ol_device_handle_t *pDstDevice;
+  void **pSrcPtr;
+  ol_device_handle_t *pSrcDevice;
+  size_t *pSize;
+  ol_event_handle_t **pEventOut;
+} ol_enqueue_memcpy_params_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Function parameters for olEnqueueMemcpyHtoD
 /// @details Each entry is a pointer to the parameter passed to the function;
-typedef struct ol_enqueue_data_write_params_t {
+typedef struct ol_enqueue_memcpy_hto_d_params_t {
   ol_queue_handle_t *pQueue;
   void **pDstPtr;
   void **pSrcPtr;
   size_t *pSize;
   ol_event_handle_t **pEventOut;
-} ol_enqueue_data_write_params_t;
+} ol_enqueue_memcpy_hto_d_params_t;
 
 ///////////////////////////////////////////////////////////////////////////////
-/// @brief Function parameters for olEnqueueDataRead
+/// @brief Function parameters for olEnqueueMemcpyDtoH
 /// @details Each entry is a pointer to the parameter passed to the function;
-typedef struct ol_enqueue_data_read_params_t {
+typedef struct ol_enqueue_memcpy_dto_h_params_t {
   ol_queue_handle_t *pQueue;
   void **pDstPtr;
   void **pSrcPtr;
   size_t *pSize;
   ol_event_handle_t **pEventOut;
-} ol_enqueue_data_read_params_t;
+} ol_enqueue_memcpy_dto_h_params_t;
 
 ///////////////////////////////////////////////////////////////////////////////
-/// @brief Function parameters for olEnqueueDataCopy
+/// @brief Function parameters for olEnqueueMemcpyDtoD
 /// @details Each entry is a pointer to the parameter passed to the function;
-typedef struct ol_enqueue_data_copy_params_t {
+typedef struct ol_enqueue_memcpy_dto_d_params_t {
   ol_queue_handle_t *pQueue;
   ol_device_handle_t *pDstDevice;
   void **pDstPtr;
   void **pSrcPtr;
   size_t *pSize;
   ol_event_handle_t **pEventOut;
-} ol_enqueue_data_copy_params_t;
+} ol_enqueue_memcpy_dto_d_params_t;
 
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Function parameters for olEnqueueKernelLaunch
@@ -1262,6 +1336,13 @@ OL_APIEXPORT ol_result_t OL_APICALL olGetDeviceInfoSizeWithCodeLoc(
     ol_device_handle_t Device, ol_device_info_t PropName, size_t *PropSizeRet,
     ol_code_location_t *CodeLocation);
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Variant of olGetHostDevice that also sets source code location
+/// information
+/// @details See also ::olGetHostDevice
+OL_APIEXPORT ol_result_t OL_APICALL olGetHostDeviceWithCodeLoc(
+    ol_device_handle_t *Device, ol_code_location_t *CodeLocation);
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Variant of olMemAlloc that also sets source code location information
 /// @details See also ::olMemAlloc
@@ -1327,26 +1408,35 @@ OL_APIEXPORT ol_result_t OL_APICALL olWaitEventWithCodeLoc(
     ol_event_handle_t Event, ol_code_location_t *CodeLocation);
 
 ///////////////////////////////////////////////////////////////////////////////
-/// @brief Variant of olEnqueueDataWrite that also sets source code location
+/// @brief Variant of olEnqueueMemcpy that also sets source code location
+/// information
+/// @details See also ::olEnqueueMemcpy
+OL_APIEXPORT ol_result_t OL_APICALL olEnqueueMemcpyWithCodeLoc(
+    ol_queue_handle_t Queue, void *DstPtr, ol_device_handle_t DstDevice,
+    void *SrcPtr, ol_device_handle_t SrcDevice, size_t Size,
+    ol_event_handle_t *EventOut, ol_code_location_t *CodeLocation);
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Variant of olEnqueueMemcpyHtoD that also sets source code location
 /// information
-/// @details See also ::olEnqueueDataWrite
-OL_APIEXPORT ol_result_t OL_APICALL olEnqueueDataWriteWithCodeLoc(
+/// @details See also ::olEnqueueMemcpyHtoD
+OL_APIEXPORT ol_result_t OL_APICALL olEnqueueMemcpyHtoDWithCodeLoc(
     ol_queue_handle_t Queue, void *DstPtr, void *SrcPtr, size_t Size,
     ol_event_handle_t *EventOut, ol_code_location_t *CodeLocation);
 
 ///////////////////////////////////////////////////////////////////////////////
-/// @brief Variant of olEnqueueDataRead that also sets source code location
+/// @brief Variant of olEnqueueMemcpyDtoH that also sets source code location
 /// information
-/// @details See also ::olEnqueueDataRead
-OL_APIEXPORT ol_result_t OL_APICALL olEnqueueDataReadWithCodeLoc(
+/// @details See also ::olEnqueueMemcpyDtoH
+OL_APIEXPORT ol_result_t OL_APICALL olEnqueueMemcpyDtoHWithCodeLoc(
     ol_queue_handle_t Queue, void *DstPtr, void *SrcPtr, size_t Size,
     ol_event_handle_t *EventOut, ol_code_location_t *CodeLocation);
 
 ///////////////////////////////////////////////////////////////////////////////
-/// @brief Variant of olEnqueueDataCopy that also sets source code location
+/// @brief Variant of olEnqueueMemcpyDtoD that also sets source code location
 /// information
-/// @details See also ::olEnqueueDataCopy
-OL_APIEXPORT ol_result_t OL_APICALL olEnqueueDataCopyWithCodeLoc(
+/// @details See also ::olEnqueueMemcpyDtoD
+OL_APIEXPORT ol_result_t OL_APICALL olEnqueueMemcpyDtoDWithCodeLoc(
     ol_queue_handle_t Queue, ol_device_handle_t DstDevice, void *DstPtr,
     void *SrcPtr, size_t Size, ol_event_handle_t *EventOut,
     ol_code_location_t *CodeLocation);
diff --git a/offload/liboffload/include/generated/OffloadEntryPoints.inc b/offload/liboffload/include/generated/OffloadEntryPoints.inc
index bd9641f74d1bb..c3104c2db735e 100644
--- a/offload/liboffload/include/generated/OffloadEntryPoints.inc
+++ b/offload/liboffload/include/generated/OffloadEntryPoints.inc
@@ -440,6 +440,43 @@ ol_result_t olGetDeviceInfoSizeWithCodeLoc(ol_device_handle_t Device,
   return Result;
 }
 
+///////////////////////////////////////////////////////////////////////////////
+ol_impl_result_t olGetHostDevice_val(ol_device_handle_t *Device) {
+  if (true /*enableParameterValidation*/) {
+    if (NULL == Device) {
+      return OL_ERRC_INVALID_NULL_POINTER;
+    }
+  }
+
+  return olGetHostDevice_impl(Device);
+}
+OL_APIEXPORT ol_result_t OL_APICALL
+olGetHostDevice(ol_device_handle_t *Device) {
+  if (offloadConfig().TracingEnabled) {
+    std::cerr << "---> olGetHostDevice";
+  }
+
+  ol_result_t Result = olGetHostDevice_val(Device);
+
+  if (offloadConfig().TracingEnabled) {
+    ol_get_host_device_params_t Params = {&Device};
+    std::cerr << "(" << &Params << ")";
+    std::cerr << "-> " << Result << "\n";
+    if (Result && Result->Details) {
+      std::cerr << "     *Error Details* " << Result->Details << " \n";
+    }
+  }
+  return Result;
+}
+ol_result_t olGetHostDeviceWithCodeLoc(ol_device_handle_t *Device,
+                                       ol_code_location_t *CodeLocation) {
+  currentCodeLocation() = CodeLocation;
+  ol_result_t Result = olGetHostDevice(Device);
+
+  currentCodeLocation() = nullptr;
+  return Result;
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 ol_impl_result_t olMemAlloc_val(ol_device_handle_t Device, ol_alloc_type_t Type,
                                 size_t Size, void **AllocationOut) {
@@ -794,9 +831,79 @@ ol_result_t olWaitEventWithCodeLoc(ol_event_handle_t Event,
 }
 
 ///////////////////////////////////////////////////////////////////////////////
-ol_impl_result_t olEnqueueDataWrite_val(ol_queue_handle_t Queue, void *DstPtr,
-                                        void *SrcPtr, size_t Size,
-                                        ol_event_handle_t *EventOut) {
+ol_impl_result_t olEnqueueMemcpy_val(ol_queue_handle_t Queue, void *DstPtr,
+                                     ol_device_handle_t DstDevice, void *SrcPtr,
+                                     ol_device_handle_t SrcDevice, size_t Size,
+                                     ol_event_handle_t *EventOut) {
+  if (true /*enableParameterValidation*/) {
+    if (Size == 0) {
+      return OL_ERRC_INVALID_SIZE;
+    }
+
+    if (NULL == Queue) {
+      return OL_ERRC_INVALID_NULL_HANDLE;
+    }
+
+    if (NULL == DstDevice) {
+      return OL_ERRC_INVALID_NULL_HANDLE;
+    }
+
+    if (NULL == SrcDevice) {
+      return OL_ERRC_INVALID_NULL_HANDLE;
+    }
+
+    if (NULL == DstPtr) {
+      return OL_ERRC_INVALID_NULL_POINTER;
+    }
+
+    if (NULL == SrcPtr) {
+      return OL_ERRC_INVALID_NULL_POINTER;
+    }
+  }
+
+  return olEnqueueMemcpy_impl(Queue, DstPtr, DstDevice, SrcPtr, SrcDevice, Size,
+                              EventOut);
+}
+OL_APIEXPORT ol_result_t OL_APICALL olEnqueueMemcpy(
+    ol_queue_handle_t Queue, void *DstPtr, ol_device_handle_t DstDevice,
+    void *SrcPtr, ol_device_handle_t SrcDevice, size_t Size,
+    ol_event_handle_t *EventOut) {
+  if (offloadConfig().TracingEnabled) {
+    std::cerr << "---> olEnqueueMemcpy";
+  }
+
+  ol_result_t Result = olEnqueueMemcpy_val(Queue, DstPtr, DstDevice, SrcPtr,
+                                           SrcDevice, Size, EventOut);
+
+  if (offloadConfig().TracingEnabled) {
+    ol_enqueue_memcpy_params_t Params = {
+        &Queue, &DstPtr, &DstDevice, &SrcPtr, &SrcDevice, &Size, &EventOut};
+    std::cerr << "(" << &Params << ")";
+    std::cerr << "-> " << Result << "\n";
+    if (Result && Result->Details) {
+      std::cerr << "     *Error Details* " << Result->Details << " \n";
+    }
+  }
+  return Result;
+}
+ol_result_t olEnqueueMemcpyWithCodeLoc(ol_queue_handle_t Queue, void *DstPtr,
+                                       ol_device_handle_t DstDevice,
+                                       void *SrcPtr,
+                                       ol_device_handle_t SrcDevice,
+                                       size_t Size, ol_event_handle_t *EventOut,
+                                       ol_code_location_t *CodeLocation) {
+  currentCodeLocation() = CodeLocation;
+  ol_result_t Result = olEnqueueMemcpy(Queue, DstPtr, DstDevice, SrcPtr,
+                                       SrcDevice, Size, EventOut);
+
+  currentCodeLocation() = nullptr;
+  return Result;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+ol_impl_result_t olEnqueueMemcpyHtoD_val(ol_queue_handle_t Queue, void *DstPtr,
+                                         void *SrcPtr, size_t Size,
+                                         ol_event_handle_t *EventOut) {
   if (true /*enableParameterValidation*/) {
     if (Size == 0) {
       return OL_ERRC_INVALID_SIZE;
@@ -815,21 +922,21 @@ ol_impl_result_t olEnqueueDataWrite_val(ol_queue_handle_t Queue, void *DstPtr,
     }
   }
 
-  return olEnqueueDataWrite_impl(Queue, DstPtr, SrcPtr, Size, EventOut);
+  return olEnqueueMemcpyHtoD_impl(Queue, DstPtr, SrcPtr, Size, EventOut);
 }
 OL_APIEXPORT ol_result_t OL_APICALL
-olEnqueueDataWrite(ol_queue_handle_t Queue, void *DstPtr, void *SrcPtr,
-                   size_t Size, ol_event_handle_t *EventOut) {
+olEnqueueMemcpyHtoD(ol_queue_handle_t Queue, void *DstPtr, void *SrcPtr,
+                    size_t Size, ol_event_handle_t *EventOut) {
   if (offloadConfig().TracingEnabled) {
-    std::cerr << "---> olEnqueueDataWrite";
+    std::cerr << "---> olEnqueueMemcpyHtoD";
   }
 
   ol_result_t Result =
-      olEnqueueDataWrite_val(Queue, DstPtr, SrcPtr, Size, EventOut);
+      olEnqueueMemcpyHtoD_val(Queue, DstPtr, SrcPtr, Size, EventOut);
 
   if (offloadConfig().TracingEnabled) {
-    ol_enqueue_data_write_params_t Params = {&Queue, &DstPtr, &SrcPtr, &Size,
-                                             &EventOut};
+    ol_enqueue_memcpy_hto_d_params_t Params = {&Queue, &DstPtr, &SrcPtr, &Size,
+                                               &EventOut};
     std::cerr << "(" << &Params << ")";
     std::cerr << "-> " << Result << "\n";
     if (Result && Result->Details) {
@@ -838,22 +945,23 @@ olEnqueueDataWrite(ol_queue_handle_t Queue, void *DstPtr, void *SrcPtr,
   }
   return Result;
 }
-ol_result_t olEnqueueDataWriteWithCodeLoc(ol_queue_handle_t Queue, void *DstPtr,
-                                          void *SrcPtr, size_t Size,
-                                          ol_event_handle_t *EventOut,
-                                          ol_code_location_t *CodeLocation) {
+ol_result_t olEnqueueMemcpyHtoDWithCodeLoc(ol_queue_handle_t Queue,
+                                           void *DstPtr, void *SrcPtr,
+                                           size_t Size,
+                                           ol_event_handle_t *EventOut,
+                                           ol_code_location_t *CodeLocation) {
   currentCodeLocation() = CodeLocation;
   ol_result_t Result =
-      olEnqueueDataWrite(Queue, DstPtr, SrcPtr, Size, EventOut);
+      olEnqueueMemcpyHtoD(Queue, DstPtr, SrcPtr, Size, EventOut);
 
   currentCodeLocation() = nullptr;
   return Result;
 }
 
 ///////////////////////////////////////////////////////////////////////////////
-ol_impl_result_t olEnqueueDataRead_val(ol_queue_handle_t Queue, void *DstPtr,
-                                       void *SrcPtr, size_t Size,
-                                       ol_event_handle_t *EventOut) {
+ol_impl_result_t olEnqueueMemcpyDtoH_val(ol_queue_handle_t Queue, void *DstPtr,
+                                         void *SrcPtr, size_t Size,
+                                         ol_event_handle_t *EventOut) {
   if (true /*enableParameterValidation*/) {
     if (NULL == Queue) {
       return OL_ERRC_INVALID_NULL_HANDLE;
@@ -868,21 +976,21 @@ ol_impl_result_t olEnqueueDataRead_val(ol_queue_handle_t Queue, void *DstPtr,
     }
   }
 
-  return olEnqueueDataRead_impl(Queue, DstPtr, SrcPtr, Size, EventOut);
+  return olEnqueueMemcpyDtoH_impl(Queue, DstPtr, SrcPtr, Size, EventOut);
 }
 OL_APIEXPORT ol_result_t OL_APICALL
-olEnqueueDataRead(ol_queue_handle_t Queue, void *DstPtr, void *SrcPtr,
-                  size_t Size, ol_event_handle_t *EventOut) {
+olEnqueueMemcpyDtoH(ol_queue_handle_t Queue, void *DstPtr, void *SrcPtr,
+                    size_t Size, ol_event_handle_t *EventOut) {
   if (offloadConfig().TracingEnabled) {
-    std::cerr << "---> olEnqueueDataRead";
+    std::cerr << "---> olEnqueueMemcpyDtoH";
   }
 
   ol_result_t Result =
-      olEnqueueDataRead_val(Queue, DstPtr, SrcPtr, Size, EventOut);
+      olEnqueueMemcpyDtoH_val(Queue, DstPtr, SrcPtr, Size, EventOut);
 
   if (offloadConfig().TracingEnabled) {
-    ol_enqueue_data_read_params_t Params = {&Queue, &DstPtr, &SrcPtr, &Size,
-                                            &EventOut};
+    ol_enqueue_memcpy_dto_h_params_t Params = {&Queue, &DstPtr, &SrcPtr, &Size,
+                                               &EventOut};
     std::cerr << "(" << &Params << ")";
     std::cerr << "-> " << Result << "\n";
     if (Result && Result->Details) {
@@ -891,22 +999,25 @@ olEnqueueDataRead(ol_queue_handle_t Queue, void *DstPtr, void *SrcPtr,
   }
   return Result;
 }
-ol_result_t olEnqueueDataReadWithCodeLoc(ol_queue_handle_t Queue, void *DstPtr,
-                                         void *SrcPtr, size_t Size,
-                                         ol_event_handle_t *EventOut,
-                                         ol_code_location_t *CodeLocation) {
+ol_result_t olEnqueueMemcpyDtoHWithCodeLoc(ol_queue_handle_t Queue,
+                                           void *DstPtr, void *SrcPtr,
+                                           size_t Size,
+                                           ol_event_handle_t *EventOut,
+                                           ol_code_location_t *CodeLocation) {
   currentCodeLocation() = CodeLocation;
-  ol_result_t Result = olEnqueueDataRead(Queue, DstPtr, SrcPtr, Size, EventOut);
+  ol_result_t Result =
+      olEnqueueMemcpyDtoH(Queue, DstPtr, SrcPtr, Size, EventOut);
 
   currentCodeLocation() = nullptr;
   return Result;
 }
 
 ///////////////////////////////////////////////////////////////////////////////
-ol_impl_result_t olEnqueueDataCopy_val(ol_queue_handle_t Queue,
-                                       ol_device_handle_t DstDevice,
-                                       void *DstPtr, void *SrcPtr, size_t Size,
-                                       ol_event_handle_t *EventOut) {
+ol_impl_result_t olEnqueueMemcpyDtoD_val(ol_queue_handle_t Queue,
+                                         ol_device_handle_t DstDevice,
+                                         void *DstPtr, void *SrcPtr,
+                                         size_t Size,
+                                         ol_event_handle_t *EventOut) {
   if (true /*enableParameterValidation*/) {
     if (NULL == Queue) {
       return OL_ERRC_INVALID_NULL_HANDLE;
@@ -925,22 +1036,22 @@ ol_impl_result_t olEnqueueDataCopy_val(ol_queue_handle_t Queue,
     }
   }
 
-  return olEnqueueDataCopy_impl(Queue, DstDevice, DstPtr, SrcPtr, Size,
-                                EventOut);
+  return olEnqueueMemcpyDtoD_impl(Queue, DstDevice, DstPtr, SrcPtr, Size,
+                                  EventOut);
 }
-OL_APIEXPORT ol_result_t OL_APICALL olEnqueueDataCopy(
+OL_APIEXPORT ol_result_t OL_APICALL olEnqueueMemcpyDtoD(
     ol_queue_handle_t Queue, ol_device_handle_t DstDevice, void *DstPtr,
     void *SrcPtr, size_t Size, ol_event_handle_t *EventOut) {
   if (offloadConfig().TracingEnabled) {
-    std::cerr << "---> olEnqueueDataCopy";
+    std::cerr << "---> olEnqueueMemcpyDtoD";
   }
 
   ol_result_t Result =
-      olEnqueueDataCopy_val(Queue, DstDevice, DstPtr, SrcPtr, Size, EventOut);
+      olEnqueueMemcpyDtoD_val(Queue, DstDevice, DstPtr, SrcPtr, Size, EventOut);
 
   if (offloadConfig().TracingEnabled) {
-    ol_enqueue_data_copy_params_t Params = {&Queue,  &DstDevice, &DstPtr,
-                                            &SrcPtr, &Size,      &EventOut};
+    ol_enqueue_memcpy_dto_d_params_t Params = {&Queue,  &DstDevice, &DstPtr,
+                                               &SrcPtr, &Size,      &EventOut};
     std::cerr << "(" << &Params << ")";
     std::cerr << "-> " << Result << "\n";
     if (Result && Result->Details) {
@@ -949,15 +1060,15 @@ OL_APIEXPORT ol_result_t OL_APICALL olEnqueueDataCopy(
   }
   return Result;
 }
-ol_result_t olEnqueueDataCopyWithCodeLoc(ol_queue_handle_t Queue,
-                                         ol_device_handle_t DstDevice,
-                                         void *DstPtr, void *SrcPtr,
-                                         size_t Size,
-                                         ol_event_handle_t *EventOut,
-                                         ol_code_location_t *CodeLocation) {
+ol_result_t olEnqueueMemcpyDtoDWithCodeLoc(ol_queue_handle_t Queue,
+                                           ol_device_handle_t DstDevice,
+                                           void *DstPtr, void *SrcPtr,
+                                           size_t Size,
+                                           ol_event_handle_t *EventOut,
+                                           ol_code_location_t *CodeLocation) {
   currentCodeLocation() = CodeLocation;
   ol_result_t Result =
-      olEnqueueDataCopy(Queue, DstDevice, DstPtr, SrcPtr, Size, EventOut);
+      olEnqueueMemcpyDtoD(Queue, DstDevice, DstPtr, SrcPtr, Size, EventOut);
 
   currentCodeLocation() = nullptr;
   return Result;
diff --git a/offload/liboffload/include/generated/OffloadFuncs.inc b/offload/liboffload/include/generated/OffloadFuncs.inc
index 05a8e47251254..6307d0a54b59e 100644
--- a/offload/liboffload/include/generated/OffloadFuncs.inc
+++ b/offload/liboffload/include/generated/OffloadFuncs.inc
@@ -20,6 +20,7 @@ OFFLOAD_FUNC(olGetDeviceCount)
 OFFLOAD_FUNC(olGetDevice)
 OFFLOAD_FUNC(olGetDeviceInfo)
 OFFLOAD_FUNC(olGetDeviceInfoSize)
+OFFLOAD_FUNC(olGetHostDevice)
 OFFLOAD_FUNC(olMemAlloc)
 OFFLOAD_FUNC(olMemFree)
 OFFLOAD_FUNC(olCreateQueue)
@@ -29,9 +30,10 @@ OFFLOAD_FUNC(olFinishQueue)
 OFFLOAD_FUNC(olRetainEvent)
 OFFLOAD_FUNC(olReleaseEvent)
 OFFLOAD_FUNC(olWaitEvent)
-OFFLOAD_FUNC(olEnqueueDataWrite)
-OFFLOAD_FUNC(olEnqueueDataRead)
-OFFLOAD_FUNC(olEnqueueDataCopy)
+OFFLOAD_FUNC(olEnqueueMemcpy)
+OFFLOAD_FUNC(olEnqueueMemcpyHtoD)
+OFFLOAD_FUNC(olEnqueueMemcpyDtoH)
+OFFLOAD_FUNC(olEnqueueMemcpyDtoD)
 OFFLOAD_FUNC(olEnqueueKernelLaunch)
 OFFLOAD_FUNC(olCreateProgram)
 OFFLOAD_FUNC(olRetainProgram)
@@ -51,6 +53,7 @@ OFFLOAD_FUNC(olGetDeviceCountWithCodeLoc)
 OFFLOAD_FUNC(olGetDeviceWithCodeLoc)
 OFFLOAD_FUNC(olGetDeviceInfoWithCodeLoc)
 OFFLOAD_FUNC(olGetDeviceInfoSizeWithCodeLoc)
+OFFLOAD_FUNC(olGetHostDeviceWithCodeLoc)
 OFFLOAD_FUNC(olMemAllocWithCodeLoc)
 OFFLOAD_FUNC(olMemFreeWithCodeLoc)
 OFFLOAD_FUNC(olCreateQueueWithCodeLoc)
@@ -60,9 +63,10 @@ OFFLOAD_FUNC(olFinishQueueWithCodeLoc)
 OFFLOAD_FUNC(olRetainEventWithCodeLoc)
 OFFLOAD_FUNC(olReleaseEventWithCodeLoc)
 OFFLOAD_FUNC(olWaitEventWithCodeLoc)
-OFFLOAD_FUNC(olEnqueueDataWriteWithCodeLoc)
-OFFLOAD_FUNC(olEnqueueDataReadWithCodeLoc)
-OFFLOAD_FUNC(olEnqueueDataCopyWithCodeLoc)
+OFFLOAD_FUNC(olEnqueueMemcpyWithCodeLoc)
+OFFLOAD_FUNC(olEnqueueMemcpyHtoDWithCodeLoc)
+OFFLOAD_FUNC(olEnqueueMemcpyDtoHWithCodeLoc)
+OFFLOAD_FUNC(olEnqueueMemcpyDtoDWithCodeLoc)
 OFFLOAD_FUNC(olEnqueueKernelLaunchWithCodeLoc)
 OFFLOAD_FUNC(olCreateProgramWithCodeLoc)
 OFFLOAD_FUNC(olRetainProgramWithCodeLoc)
diff --git a/offload/liboffload/include/generated/OffloadImplFuncDecls.inc b/offload/liboffload/include/generated/OffloadImplFuncDecls.inc
index 976422f3d7fd5..1fb77e3d278c9 100644
--- a/offload/liboffload/include/generated/OffloadImplFuncDecls.inc
+++ b/offload/liboffload/include/generated/OffloadImplFuncDecls.inc
@@ -37,6 +37,8 @@ ol_impl_result_t olGetDeviceInfoSize_impl(ol_device_handle_t Device,
                                           ol_device_info_t PropName,
                                           size_t *PropSizeRet);
 
+ol_impl_result_t olGetHostDevice_impl(ol_device_handle_t *Device);
+
 ol_impl_result_t olMemAlloc_impl(ol_device_handle_t Device,
                                  ol_alloc_type_t Type, size_t Size,
                                  void **AllocationOut);
@@ -59,18 +61,25 @@ ol_impl_result_t olReleaseEvent_impl(ol_event_handle_t Event);
 
 ol_impl_result_t olWaitEvent_impl(ol_event_handle_t Event);
 
-ol_impl_result_t olEnqueueDataWrite_impl(ol_queue_handle_t Queue, void *DstPtr,
-                                         void *SrcPtr, size_t Size,
-                                         ol_event_handle_t *EventOut);
-
-ol_impl_result_t olEnqueueDataRead_impl(ol_queue_handle_t Queue, void *DstPtr,
-                                        void *SrcPtr, size_t Size,
-                                        ol_event_handle_t *EventOut);
-
-ol_impl_result_t olEnqueueDataCopy_impl(ol_queue_handle_t Queue,
-                                        ol_device_handle_t DstDevice,
-                                        void *DstPtr, void *SrcPtr, size_t Size,
-                                        ol_event_handle_t *EventOut);
+ol_impl_result_t olEnqueueMemcpy_impl(ol_queue_handle_t Queue, void *DstPtr,
+                                      ol_device_handle_t DstDevice,
+                                      void *SrcPtr,
+                                      ol_device_handle_t SrcDevice, size_t Size,
+                                      ol_event_handle_t *EventOut);
+
+ol_impl_result_t olEnqueueMemcpyHtoD_impl(ol_queue_handle_t Queue, void *DstPtr,
+                                          void *SrcPtr, size_t Size,
+                                          ol_event_handle_t *EventOut);
+
+ol_impl_result_t olEnqueueMemcpyDtoH_impl(ol_queue_handle_t Queue, void *DstPtr,
+                                          void *SrcPtr, size_t Size,
+                                          ol_event_handle_t *EventOut);
+
+ol_impl_result_t olEnqueueMemcpyDtoD_impl(ol_queue_handle_t Queue,
+                                          ol_device_handle_t DstDevice,
+                                          void *DstPtr, void *SrcPtr,
+                                          size_t Size,
+                                          ol_event_handle_t *EventOut);
 
 ol_impl_result_t
 olEnqueueKernelLaunch_impl(ol_queue_handle_t Queue, ol_kernel_handle_t Kernel,
diff --git a/offload/liboffload/include/generated/OffloadPrint.hpp b/offload/liboffload/include/generated/OffloadPrint.hpp
index 10dc58d1079a0..a5616f85ea8a3 100644
--- a/offload/liboffload/include/generated/OffloadPrint.hpp
+++ b/offload/liboffload/include/generated/OffloadPrint.hpp
@@ -475,6 +475,13 @@ operator<<(std::ostream &os,
   return os;
 }
 
+inline std::ostream &
+operator<<(std::ostream &os, const struct ol_get_host_device_params_t *params) {
+  os << ".Device = ";
+  printPtr(os, *params->pDevice);
+  return os;
+}
+
 inline std::ostream &operator<<(std::ostream &os,
                                 const struct ol_mem_alloc_params_t *params) {
   os << ".Device = ";
@@ -556,9 +563,34 @@ inline std::ostream &operator<<(std::ostream &os,
   return os;
 }
 
+inline std::ostream &
+operator<<(std::ostream &os, const struct ol_enqueue_memcpy_params_t *params) {
+  os << ".Queue = ";
+  printPtr(os, *params->pQueue);
+  os << ", ";
+  os << ".DstPtr = ";
+  printPtr(os, *params->pDstPtr);
+  os << ", ";
+  os << ".DstDevice = ";
+  printPtr(os, *params->pDstDevice);
+  os << ", ";
+  os << ".SrcPtr = ";
+  printPtr(os, *params->pSrcPtr);
+  os << ", ";
+  os << ".SrcDevice = ";
+  printPtr(os, *params->pSrcDevice);
+  os << ", ";
+  os << ".Size = ";
+  os << *params->pSize;
+  os << ", ";
+  os << ".EventOut = ";
+  printPtr(os, *params->pEventOut);
+  return os;
+}
+
 inline std::ostream &
 operator<<(std::ostream &os,
-           const struct ol_enqueue_data_write_params_t *params) {
+           const struct ol_enqueue_memcpy_hto_d_params_t *params) {
   os << ".Queue = ";
   printPtr(os, *params->pQueue);
   os << ", ";
@@ -578,7 +610,7 @@ operator<<(std::ostream &os,
 
 inline std::ostream &
 operator<<(std::ostream &os,
-           const struct ol_enqueue_data_read_params_t *params) {
+           const struct ol_enqueue_memcpy_dto_h_params_t *params) {
   os << ".Queue = ";
   printPtr(os, *params->pQueue);
   os << ", ";
@@ -598,7 +630,7 @@ operator<<(std::ostream &os,
 
 inline std::ostream &
 operator<<(std::ostream &os,
-           const struct ol_enqueue_data_copy_params_t *params) {
+           const struct ol_enqueue_memcpy_dto_d_params_t *params) {
   os << ".Queue = ";
   printPtr(os, *params->pQueue);
   os << ", ";
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index d395eb34a5f4a..63d5bdb1e8f61 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -27,7 +27,7 @@ using namespace llvm::omp::target::plugin;
 // interface.
 struct ol_device_impl_t {
   int DeviceNum;
-  GenericDeviceTy &Device;
+  GenericDeviceTy *Device;
   ol_platform_handle_t Platform;
 };
 
@@ -45,7 +45,6 @@ struct ol_queue_impl_t {
 struct ol_event_impl_t {
   void *EventInfo;
   ol_queue_handle_t Queue;
-  ol_device_handle_t Device;
   std::atomic_uint32_t RefCount;
 };
 
@@ -107,6 +106,11 @@ PlatformVecT &Platforms() {
   return Platforms;
 }
 
+ol_device_handle_t HostDevice() {
+  static ol_device_impl_t HostDeviceImpl{-1, nullptr, nullptr};
+  return &HostDeviceImpl;
+}
+
 // TODO: Some plugins expect to be linked into libomptarget which defines these
 // symbols to implement ompt callbacks. The least invasive workaround here is to
 // define them in libLLVMOffload as false/null so they are never used. In future
@@ -144,7 +148,7 @@ void initPlugins() {
          DevNum++) {
       if (Platform.Plugin->init_device(DevNum) == OFFLOAD_SUCCESS) {
         Platform.Devices.emplace_back(ol_device_impl_t{
-            DevNum, Platform.Plugin->getDevice(DevNum), &Platform});
+            DevNum, &Platform.Plugin->getDevice(DevNum), &Platform});
       }
     }
   }
@@ -260,7 +264,7 @@ ol_impl_result_t olGetDeviceInfoImplDetail(ol_device_handle_t Device,
   ReturnHelper ReturnValue(PropSize, PropValue, PropSizeRet);
 
   InfoQueueTy DevInfo;
-  if (auto Err = Device->Device.obtainInfoImpl(DevInfo))
+  if (auto Err = Device->Device->obtainInfoImpl(DevInfo))
     return OL_ERRC_OUT_OF_RESOURCES;
 
   // Find the info if it exists under any of the given names
@@ -312,6 +316,11 @@ ol_impl_result_t olGetDeviceInfoSize_impl(ol_device_handle_t Device,
   return olGetDeviceInfoImplDetail(Device, PropName, 0, nullptr, PropSizeRet);
 }
 
+ol_impl_result_t olGetHostDevice_impl(ol_device_handle_t *Device) {
+  *Device = HostDevice();
+  return OL_SUCCESS;
+}
+
 TargetAllocTy convertOlToPluginAllocTy(ol_alloc_type_t Type) {
   switch (Type) {
   case OL_ALLOC_TYPE_DEVICE:
@@ -328,7 +337,7 @@ ol_impl_result_t olMemAlloc_impl(ol_device_handle_t Device,
                                  ol_alloc_type_t Type, size_t Size,
                                  void **AllocationOut) {
   auto Alloc =
-      Device->Device.dataAlloc(Size, nullptr, convertOlToPluginAllocTy(Type));
+      Device->Device->dataAlloc(Size, nullptr, convertOlToPluginAllocTy(Type));
   if (!Alloc)
     return {OL_ERRC_OUT_OF_RESOURCES,
             formatv("Could not create allocation on device {0}", Device).str()};
@@ -339,7 +348,8 @@ ol_impl_result_t olMemAlloc_impl(ol_device_handle_t Device,
 
 ol_impl_result_t olMemFree_impl(ol_device_handle_t Device, ol_alloc_type_t Type,
                                 void *Address) {
-  auto Res = Device->Device.dataDelete(Address, convertOlToPluginAllocTy(Type));
+  auto Res =
+      Device->Device->dataDelete(Address, convertOlToPluginAllocTy(Type));
   if (Res)
     return {OL_ERRC_OUT_OF_RESOURCES, "Could not free allocation"};
 
@@ -349,7 +359,7 @@ ol_impl_result_t olMemFree_impl(ol_device_handle_t Device, ol_alloc_type_t Type,
 ol_impl_result_t olCreateQueue_impl(ol_device_handle_t Device,
                                     ol_queue_handle_t *Queue) {
   auto CreatedQueue = std::make_unique<ol_queue_impl_t>();
-  auto Err = Device->Device.initAsyncInfo(&(CreatedQueue->AsyncInfo));
+  auto Err = Device->Device->initAsyncInfo(&(CreatedQueue->AsyncInfo));
   if (Err)
     return {OL_ERRC_UNKNOWN, "Could not initialize stream resource"};
 
@@ -375,7 +385,7 @@ ol_impl_result_t olFinishQueue_impl(ol_queue_handle_t Queue) {
   // Host plugin doesn't have a queue set so it's not safe to call synchronize
   // on it, but we have nothing to synchronize in that situation anyway.
   if (Queue->AsyncInfo->Queue) {
-    auto Err = Queue->Device->Device.synchronize(Queue->AsyncInfo);
+    auto Err = Queue->Device->Device->synchronize(Queue->AsyncInfo);
     if (Err)
       return {OL_ERRC_INVALID_QUEUE, "The queue failed to synchronize"};
   }
@@ -383,7 +393,7 @@ ol_impl_result_t olFinishQueue_impl(ol_queue_handle_t Queue) {
   // Recreate the stream resource so the queue can be reused
   // TODO: Would be easier for the synchronization to (optionally) not release
   // it to begin with.
-  auto Res = Queue->Device->Device.initAsyncInfo(&Queue->AsyncInfo);
+  auto Res = Queue->Device->Device->initAsyncInfo(&Queue->AsyncInfo);
   if (Res)
     return {OL_ERRC_UNKNOWN, "Could not reinitialize the stream resource"};
 
@@ -391,7 +401,7 @@ ol_impl_result_t olFinishQueue_impl(ol_queue_handle_t Queue) {
 }
 
 ol_impl_result_t olWaitEvent_impl(ol_event_handle_t Event) {
-  auto Res = Event->Device->Device.syncEvent(Event->EventInfo);
+  auto Res = Event->Queue->Device->Device->syncEvent(Event->EventInfo);
   if (Res)
     return {OL_ERRC_INVALID_EVENT, "The event failed to synchronize"};
 
@@ -413,24 +423,59 @@ ol_impl_result_t olReleaseEvent_impl(ol_event_handle_t Event) {
 ol_event_handle_t makeEvent(ol_queue_handle_t Queue) {
   auto EventImpl = std::make_unique<ol_event_impl_t>();
   EventImpl->Queue = Queue;
-  auto Res = Queue->Device->Device.createEvent(&EventImpl->EventInfo);
+  auto Res = Queue->Device->Device->createEvent(&EventImpl->EventInfo);
   if (Res)
     return nullptr;
 
-  Res =
-      Queue->Device->Device.recordEvent(EventImpl->EventInfo, Queue->AsyncInfo);
+  Res = Queue->Device->Device->recordEvent(EventImpl->EventInfo,
+                                           Queue->AsyncInfo);
   if (Res)
     return nullptr;
 
   return EventImpl.release();
 }
 
-ol_impl_result_t olEnqueueDataWrite_impl(ol_queue_handle_t Queue, void *DstPtr,
-                                         void *SrcPtr, size_t Size,
-                                         ol_event_handle_t *EventOut) {
-  auto &DeviceImpl = Queue->Device->Device;
+ol_impl_result_t olEnqueueMemcpy_impl(ol_queue_handle_t Queue, void *DstPtr,
+                                      ol_device_handle_t DstDevice,
+                                      void *SrcPtr,
+                                      ol_device_handle_t SrcDevice, size_t Size,
+                                      ol_event_handle_t *EventOut) {
+  if (DstDevice == HostDevice() && SrcDevice == HostDevice()) {
+    // TODO: We could actually handle this with a plain memcpy but we currently
+    // have no way of synchronizing this with the queue
+    return {OL_ERRC_INVALID_ARGUMENT,
+            "One of DstDevice and SrcDevice must be a non-host device"};
+  }
+
+  if (DstDevice == HostDevice()) {
+    auto Res =
+        SrcDevice->Device->dataRetrieve(DstPtr, SrcPtr, Size, Queue->AsyncInfo);
+    if (Res)
+      return {OL_ERRC_UNKNOWN, "The data retrieve operation failed"};
+  } else if (SrcDevice == HostDevice()) {
+    auto Res =
+        DstDevice->Device->dataSubmit(DstPtr, SrcPtr, Size, Queue->AsyncInfo);
+    if (Res)
+      return {OL_ERRC_UNKNOWN, "The data submit operation failed"};
+  } else {
+    auto Res = SrcDevice->Device->dataExchange(SrcPtr, *DstDevice->Device,
+                                               DstPtr, Size, Queue->AsyncInfo);
+    if (Res)
+      return {OL_ERRC_UNKNOWN, "The data exchange operation failed"};
+  }
+
+  if (EventOut)
+    *EventOut = makeEvent(Queue);
+
+  return OL_SUCCESS;
+}
+
+ol_impl_result_t olEnqueueMemcpyHtoD_impl(ol_queue_handle_t Queue, void *DstPtr,
+                                          void *SrcPtr, size_t Size,
+                                          ol_event_handle_t *EventOut) {
+  auto *DeviceImpl = Queue->Device->Device;
 
-  auto Res = DeviceImpl.dataSubmit(DstPtr, SrcPtr, Size, Queue->AsyncInfo);
+  auto Res = DeviceImpl->dataSubmit(DstPtr, SrcPtr, Size, Queue->AsyncInfo);
 
   if (Res)
     return {OL_ERRC_UNKNOWN, "The data submit operation failed"};
@@ -441,12 +486,12 @@ ol_impl_result_t olEnqueueDataWrite_impl(ol_queue_handle_t Queue, void *DstPtr,
   return OL_SUCCESS;
 }
 
-ol_impl_result_t olEnqueueDataRead_impl(ol_queue_handle_t Queue, void *DstPtr,
-                                        void *SrcPtr, size_t Size,
-                                        ol_event_handle_t *EventOut) {
-  auto &DeviceImpl = Queue->Device->Device;
+ol_impl_result_t olEnqueueMemcpyDtoH_impl(ol_queue_handle_t Queue, void *DstPtr,
+                                          void *SrcPtr, size_t Size,
+                                          ol_event_handle_t *EventOut) {
+  auto *DeviceImpl = Queue->Device->Device;
 
-  auto Res = DeviceImpl.dataRetrieve(DstPtr, SrcPtr, Size, Queue->AsyncInfo);
+  auto Res = DeviceImpl->dataRetrieve(DstPtr, SrcPtr, Size, Queue->AsyncInfo);
 
   if (Res)
     return {OL_ERRC_UNKNOWN, "The data retrieve operation failed"};
@@ -457,14 +502,15 @@ ol_impl_result_t olEnqueueDataRead_impl(ol_queue_handle_t Queue, void *DstPtr,
   return OL_SUCCESS;
 }
 
-ol_impl_result_t olEnqueueDataCopy_impl(ol_queue_handle_t Queue,
-                                        ol_device_handle_t DstDevice,
-                                        void *DstPtr, void *SrcPtr, size_t Size,
-                                        ol_event_handle_t *EventOut) {
-  auto &DeviceImpl = Queue->Device->Device;
+ol_impl_result_t olEnqueueMemcpyDtoD_impl(ol_queue_handle_t Queue,
+                                          ol_device_handle_t DstDevice,
+                                          void *DstPtr, void *SrcPtr,
+                                          size_t Size,
+                                          ol_event_handle_t *EventOut) {
+  auto *DeviceImpl = Queue->Device->Device;
 
-  auto Res = DeviceImpl.dataExchange(SrcPtr, DstDevice->Device, DstPtr, Size,
-                                     Queue->AsyncInfo);
+  auto Res = DeviceImpl->dataExchange(SrcPtr, *DstDevice->Device, DstPtr, Size,
+                                      Queue->AsyncInfo);
 
   if (Res)
     return {OL_ERRC_UNKNOWN, "The data exchange operation failed"};
@@ -489,7 +535,7 @@ ol_impl_result_t olCreateProgram_impl(ol_device_handle_t Device, void *ProgData,
 
   ol_program_handle_t Prog = new ol_program_impl_t();
 
-  auto Res = Device->Device.loadBinary(Device->Device.Plugin, &DeviceImage);
+  auto Res = Device->Device->loadBinary(Device->Device->Plugin, &DeviceImage);
   if (!Res)
     return OL_ERRC_INVALID_VALUE;
 
@@ -559,9 +605,9 @@ ol_impl_result_t
 olEnqueueKernelLaunch_impl(ol_queue_handle_t Queue, ol_kernel_handle_t Kernel,
                            const ol_kernel_launch_size_args_t *LaunchSizeArgs,
                            ol_event_handle_t *EventOut) {
-  auto &DeviceImpl = Queue->Device->Device;
+  auto *DeviceImpl = Queue->Device->Device;
 
-  AsyncInfoWrapperTy AsyncInfoWrapper(DeviceImpl, Queue->AsyncInfo);
+  AsyncInfoWrapperTy AsyncInfoWrapper(*DeviceImpl, Queue->AsyncInfo);
 
   KernelArgsTy LaunchArgs{};
   LaunchArgs.NumArgs = Kernel->Args.getPointers().size();
@@ -578,7 +624,7 @@ olEnqueueKernelLaunch_impl(ol_queue_handle_t Queue, ol_kernel_handle_t Kernel,
   // No offsets needed, arguments are real pointers
   auto ArgOffsets = std::vector<ptrdiff_t>(LaunchArgs.NumArgs, 0ul);
 
-  auto Err = Kernel->KernelImpl->launch(DeviceImpl, LaunchArgs.ArgPtrs,
+  auto Err = Kernel->KernelImpl->launch(*DeviceImpl, LaunchArgs.ArgPtrs,
                                         ArgOffsets.data(), LaunchArgs,
                                         AsyncInfoWrapper);
 
diff --git a/offload/unittests/OffloadAPI/CMakeLists.txt b/offload/unittests/OffloadAPI/CMakeLists.txt
index c7f28d147db14..d942d244beadd 100644
--- a/offload/unittests/OffloadAPI/CMakeLists.txt
+++ b/offload/unittests/OffloadAPI/CMakeLists.txt
@@ -20,6 +20,7 @@ add_libompt_unittest("offload.unittests"
     ${CMAKE_CURRENT_SOURCE_DIR}/enqueue/olEnqueueDataWrite.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/enqueue/olEnqueueDataRead.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/enqueue/olEnqueueDataCopy.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/enqueue/olEnqueueMemcpy.cpp
     )
 add_dependencies("offload.unittests" ${PLUGINS_TEST_COMMON})
 target_link_libraries("offload.unittests" PRIVATE ${PLUGINS_TEST_COMMON})
diff --git a/offload/unittests/OffloadAPI/enqueue/olEnqueueDataCopy.cpp b/offload/unittests/OffloadAPI/enqueue/olEnqueueDataCopy.cpp
index dc2791266fa14..84aa88009fce8 100644
--- a/offload/unittests/OffloadAPI/enqueue/olEnqueueDataCopy.cpp
+++ b/offload/unittests/OffloadAPI/enqueue/olEnqueueDataCopy.cpp
@@ -1,4 +1,4 @@
-//===------- Offload API tests - olEnqueueDataCopy ------------------------===//
+//===------- Offload API tests - olEnqueueMemcpyDtoD ----------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -10,9 +10,9 @@
 #include <OffloadAPI.h>
 #include <gtest/gtest.h>
 
-using olEnqueueDataCopyTest = offloadQueueTest;
+using olEnqueueMemcpyDtoDTest = offloadQueueTest;
 
-TEST_F(olEnqueueDataCopyTest, Success) {
+TEST_F(olEnqueueMemcpyDtoDTest, Success) {
   constexpr size_t Size = 1024;
   void *AllocA;
   void *AllocB;
@@ -22,11 +22,11 @@ TEST_F(olEnqueueDataCopyTest, Success) {
   ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, Size, &AllocA));
   ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, Size, &AllocB));
   ASSERT_SUCCESS(
-      olEnqueueDataWrite(Queue, AllocA, Input.data(), Size, nullptr));
+      olEnqueueMemcpyHtoD(Queue, AllocA, Input.data(), Size, nullptr));
   ASSERT_SUCCESS(
-      olEnqueueDataCopy(Queue, Device, AllocB, AllocA, Size, nullptr));
+      olEnqueueMemcpyDtoD(Queue, Device, AllocB, AllocA, Size, nullptr));
   ASSERT_SUCCESS(
-      olEnqueueDataRead(Queue, Output.data(), AllocB, Size, nullptr));
+      olEnqueueMemcpyDtoH(Queue, Output.data(), AllocB, Size, nullptr));
   ASSERT_SUCCESS(olFinishQueue(Queue));
   for (uint8_t Val : Output) {
     ASSERT_EQ(Val, 42);
diff --git a/offload/unittests/OffloadAPI/enqueue/olEnqueueDataRead.cpp b/offload/unittests/OffloadAPI/enqueue/olEnqueueDataRead.cpp
index 71323e4b44817..d9e2be4146934 100644
--- a/offload/unittests/OffloadAPI/enqueue/olEnqueueDataRead.cpp
+++ b/offload/unittests/OffloadAPI/enqueue/olEnqueueDataRead.cpp
@@ -1,4 +1,4 @@
-//===------- Offload API tests - olEnqueueDataRead ------------------------===//
+//===------- Offload API tests - olEnqueueMemcpyDtoH ----------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -10,17 +10,19 @@
 #include <OffloadAPI.h>
 #include <gtest/gtest.h>
 
-using olEnqueueDataReadTest = offloadQueueTest;
+using olEnqueueMemcpyDtoHTest = offloadQueueTest;
 
-TEST_F(olEnqueueDataReadTest, Success) {
+TEST_F(olEnqueueMemcpyDtoHTest, Success) {
   constexpr size_t Size = 1024;
   void *Alloc;
   std::vector<uint8_t> Input(Size, 42);
   std::vector<uint8_t> Output(Size, 0);
 
   ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, Size, &Alloc));
-  ASSERT_SUCCESS(olEnqueueDataWrite(Queue, Alloc, Input.data(), Size, nullptr));
-  ASSERT_SUCCESS(olEnqueueDataRead(Queue, Output.data(), Alloc, Size, nullptr));
+  ASSERT_SUCCESS(
+      olEnqueueMemcpyHtoD(Queue, Alloc, Input.data(), Size, nullptr));
+  ASSERT_SUCCESS(
+      olEnqueueMemcpyDtoH(Queue, Output.data(), Alloc, Size, nullptr));
   ASSERT_SUCCESS(olFinishQueue(Queue));
   for (uint8_t Val : Output) {
     ASSERT_EQ(Val, 42);
diff --git a/offload/unittests/OffloadAPI/enqueue/olEnqueueDataWrite.cpp b/offload/unittests/OffloadAPI/enqueue/olEnqueueDataWrite.cpp
index f60d501a9918d..81d338abcbd8c 100644
--- a/offload/unittests/OffloadAPI/enqueue/olEnqueueDataWrite.cpp
+++ b/offload/unittests/OffloadAPI/enqueue/olEnqueueDataWrite.cpp
@@ -1,4 +1,4 @@
-//===------- Offload API tests - olEnqueueDataWrite -----------------------===//
+//===------- Offload API tests - olEnqueueMemcpyHtoD ----------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -10,14 +10,15 @@
 #include <OffloadAPI.h>
 #include <gtest/gtest.h>
 
-using olEnqueueDataWriteTest = offloadQueueTest;
+using olEnqueueMemcpyHtoDTest = offloadQueueTest;
 
-TEST_F(olEnqueueDataWriteTest, Success) {
+TEST_F(olEnqueueMemcpyHtoDTest, Success) {
   constexpr size_t Size = 1024;
   void *Alloc;
   ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, Size, &Alloc));
   std::vector<uint8_t> Input(Size, 42);
-  ASSERT_SUCCESS(olEnqueueDataWrite(Queue, Alloc, Input.data(), Size, nullptr));
+  ASSERT_SUCCESS(
+      olEnqueueMemcpyHtoD(Queue, Alloc, Input.data(), Size, nullptr));
   olFinishQueue(Queue);
   olMemFree(Device, OL_ALLOC_TYPE_DEVICE, Alloc);
 }
diff --git a/offload/unittests/OffloadAPI/enqueue/olEnqueueMemcpy.cpp b/offload/unittests/OffloadAPI/enqueue/olEnqueueMemcpy.cpp
new file mode 100644
index 0000000000000..13be31fc8d801
--- /dev/null
+++ b/offload/unittests/OffloadAPI/enqueue/olEnqueueMemcpy.cpp
@@ -0,0 +1,71 @@
+//===------- Offload API tests - olEnqueueMemcpy --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "../common/Fixtures.hpp"
+#include <OffloadAPI.h>
+#include <gtest/gtest.h>
+
+using olEnqueueMemcpyTest = offloadQueueTest;
+
+TEST_F(olEnqueueMemcpyTest, SuccessH2D) {
+  constexpr size_t Size = 1024;
+  void *Alloc;
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, Size, &Alloc));
+  std::vector<uint8_t> Input(Size, 42);
+  ol_device_handle_t Host; // host device handle, passed as SrcDevice below
+  ASSERT_SUCCESS(olGetHostDevice(&Host));
+  ASSERT_SUCCESS(
+      olEnqueueMemcpy(Queue, Alloc, Device, Input.data(), Host, Size, nullptr));
+  ASSERT_SUCCESS(olFinishQueue(Queue)); // check the queue sync result too
+  ASSERT_SUCCESS(olMemFree(Device, OL_ALLOC_TYPE_DEVICE, Alloc));
+}
+
+TEST_F(olEnqueueMemcpyTest, SuccessDtoH) {
+  constexpr size_t Size = 1024;
+  void *Alloc;
+  std::vector<uint8_t> Input(Size, 42);
+  std::vector<uint8_t> Output(Size, 0);
+  ol_device_handle_t Host;
+  ASSERT_SUCCESS(olGetHostDevice(&Host));
+
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, Size, &Alloc));
+  ASSERT_SUCCESS(
+      olEnqueueMemcpy(Queue, Alloc, Device, Input.data(), Host, Size, nullptr));
+  ASSERT_SUCCESS(olEnqueueMemcpy(Queue, Output.data(), Host, Alloc, Device,
+                                 Size, nullptr));
+  ASSERT_SUCCESS(olFinishQueue(Queue));
+  for (uint8_t Val : Output) {
+    ASSERT_EQ(Val, 42);
+  }
+  ASSERT_SUCCESS(olMemFree(Device, OL_ALLOC_TYPE_DEVICE, Alloc));
+}
+
+TEST_F(olEnqueueMemcpyTest, SuccessDtoD) {
+  constexpr size_t Size = 1024;
+  void *AllocA;
+  void *AllocB;
+  std::vector<uint8_t> Input(Size, 42);
+  std::vector<uint8_t> Output(Size, 0);
+  ol_device_handle_t Host;
+  ASSERT_SUCCESS(olGetHostDevice(&Host));
+
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, Size, &AllocA));
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, Size, &AllocB));
+  ASSERT_SUCCESS(olEnqueueMemcpy(Queue, AllocA, Device, Input.data(), Host,
+                                 Size, nullptr));
+  ASSERT_SUCCESS(
+      olEnqueueMemcpy(Queue, AllocB, Device, AllocA, Device, Size, nullptr));
+  ASSERT_SUCCESS(olEnqueueMemcpy(Queue, Output.data(), Host, AllocB, Device,
+                                 Size, nullptr));
+  ASSERT_SUCCESS(olFinishQueue(Queue));
+  for (uint8_t Val : Output) {
+    ASSERT_EQ(Val, 42);
+  }
+  ASSERT_SUCCESS(olMemFree(Device, OL_ALLOC_TYPE_DEVICE, AllocA));
+  ASSERT_SUCCESS(olMemFree(Device, OL_ALLOC_TYPE_DEVICE, AllocB));
+}

>From df52c01e7bacb343e191f7d3367f06789892a2d6 Mon Sep 17 00:00:00 2001
From: Callum Fare <callum at codeplay.com>
Date: Tue, 11 Feb 2025 13:10:37 +0000
Subject: [PATCH 17/17] Rework program and kernel implementation

---
 offload/liboffload/API/Enqueue.td             |   2 +
 offload/liboffload/API/Kernel.td              |  31 -----
 .../liboffload/include/generated/OffloadAPI.h |  93 ++-------------
 .../include/generated/OffloadEntryPoints.inc  | 111 +++---------------
 .../include/generated/OffloadFuncs.inc        |   4 -
 .../generated/OffloadImplFuncDecls.inc        |   8 +-
 .../include/generated/OffloadPrint.hpp        |  37 +-----
 offload/liboffload/src/OffloadImpl.cpp        |  87 +++-----------
 8 files changed, 51 insertions(+), 322 deletions(-)

diff --git a/offload/liboffload/API/Enqueue.td b/offload/liboffload/API/Enqueue.td
index 723d87cfb2d29..d29441559e788 100644
--- a/offload/liboffload/API/Enqueue.td
+++ b/offload/liboffload/API/Enqueue.td
@@ -97,6 +97,8 @@ def : Function {
     let params = [
         Param<"ol_queue_handle_t", "Queue", "handle of the queue", PARAM_IN>,
         Param<"ol_kernel_handle_t", "Kernel", "handle of the kernel", PARAM_IN>,
+        Param<"const void*", "ArgumentsData", "pointer to the kernel argument struct", PARAM_IN>,
+        Param<"size_t", "ArgumentsSize", "size of the kernel argument struct", PARAM_IN>,
         Param<"const ol_kernel_launch_size_args_t*", "LaunchSizeArgs", "pointer to the struct containing launch size parameters", PARAM_IN>,
         Param<"ol_event_handle_t*", "EventOut", "optional recorded event for the enqueued operation", PARAM_OUT_OPTIONAL>
     ];
diff --git a/offload/liboffload/API/Kernel.td b/offload/liboffload/API/Kernel.td
index cad738c56b3a3..2e16ea9a8078b 100644
--- a/offload/liboffload/API/Kernel.td
+++ b/offload/liboffload/API/Kernel.td
@@ -43,34 +43,3 @@ def : Function {
     ];
     let returns = [];
 }
-
-def : Function {
-    let name = "olSetKernelArgValue";
-    let desc = "Set the value of a single kernel argument at the given index";
-    let details = [
-        "The implementation will construct and lay out the backing storage for the kernel arguments."
-        "The effects of calls to this function on a kernel are lost if olSetKernelArgsData is called."
-    ];
-    let params = [
-        Param<"ol_kernel_handle_t", "Kernel", "handle of the kernel", PARAM_IN>,
-        Param<"uint32_t", "Index", "index of the argument", PARAM_IN>,
-        Param<"size_t", "Size", "size of the argument data", PARAM_IN>,
-        Param<"void*", "ArgData", "pointer to the argument data", PARAM_IN>
-    ];
-    let returns = [];
-}
-
-def : Function {
-    let name = "olSetKernelArgsData";
-    let desc = "Set the entire argument data for a kernel";
-    let details = [
-        "Previous calls to olSetKernelArgValue on the same kernel are invalidated by this function"
-        "The data pointed to by ArgsData is assumed to be laid out correctly according to the requirements of the backend API"
-    ];
-    let params = [
-        Param<"ol_kernel_handle_t", "Kernel", "handle of the kernel", PARAM_IN>,
-        Param<"void*", "ArgsData", "pointer to the argument data", PARAM_IN>,
-        Param<"size_t", "ArgsDataSize", "size of the argument data", PARAM_IN>
-    ];
-    let returns = [];
-}
diff --git a/offload/liboffload/include/generated/OffloadAPI.h b/offload/liboffload/include/generated/OffloadAPI.h
index dd301f564a283..580445ff9d8ca 100644
--- a/offload/liboffload/include/generated/OffloadAPI.h
+++ b/offload/liboffload/include/generated/OffloadAPI.h
@@ -815,12 +815,17 @@ typedef struct ol_kernel_launch_size_args_t {
 ///         + `NULL == Queue`
 ///         + `NULL == Kernel`
 ///     - ::OL_ERRC_INVALID_NULL_POINTER
+///         + `NULL == ArgumentsData`
 ///         + `NULL == LaunchSizeArgs`
 OL_APIEXPORT ol_result_t OL_APICALL olEnqueueKernelLaunch(
     // [in] handle of the queue
     ol_queue_handle_t Queue,
     // [in] handle of the kernel
     ol_kernel_handle_t Kernel,
+    // [in] pointer to the kernel argument struct
+    const void *ArgumentsData,
+    // [in] size of the kernel argument struct
+    size_t ArgumentsSize,
     // [in] pointer to the struct containing launch size parameters
     const ol_kernel_launch_size_args_t *LaunchSizeArgs,
     // [out][optional] optional recorded event for the enqueued operation
@@ -942,56 +947,6 @@ OL_APIEXPORT ol_result_t OL_APICALL olReleaseKernel(
     // [in] handle of the kernel
     ol_kernel_handle_t Kernel);
 
-///////////////////////////////////////////////////////////////////////////////
-/// @brief Set the value of a single kernel argument at the given index
-///
-/// @details
-///    - The implementation will construct and lay out the backing storage for
-///    the kernel arguments.The effects of calls to this function on a kernel
-///    are lost if olSetKernelArgsData is called.
-///
-/// @returns
-///     - ::OL_RESULT_SUCCESS
-///     - ::OL_ERRC_UNINITIALIZED
-///     - ::OL_ERRC_DEVICE_LOST
-///     - ::OL_ERRC_INVALID_NULL_HANDLE
-///         + `NULL == Kernel`
-///     - ::OL_ERRC_INVALID_NULL_POINTER
-///         + `NULL == ArgData`
-OL_APIEXPORT ol_result_t OL_APICALL olSetKernelArgValue(
-    // [in] handle of the kernel
-    ol_kernel_handle_t Kernel,
-    // [in] index of the argument
-    uint32_t Index,
-    // [in] size of the argument data
-    size_t Size,
-    // [in] pointer to the argument data
-    void *ArgData);
-
-///////////////////////////////////////////////////////////////////////////////
-/// @brief Set the entire argument data for a kernel
-///
-/// @details
-///    - Previous calls to olSetKernelArgValue on the same kernel are
-///    invalidated by this functionThe data pointed to by ArgsData is assumed to
-///    be laid out correctly according to the requirements of the backend API
-///
-/// @returns
-///     - ::OL_RESULT_SUCCESS
-///     - ::OL_ERRC_UNINITIALIZED
-///     - ::OL_ERRC_DEVICE_LOST
-///     - ::OL_ERRC_INVALID_NULL_HANDLE
-///         + `NULL == Kernel`
-///     - ::OL_ERRC_INVALID_NULL_POINTER
-///         + `NULL == ArgsData`
-OL_APIEXPORT ol_result_t OL_APICALL olSetKernelArgsData(
-    // [in] handle of the kernel
-    ol_kernel_handle_t Kernel,
-    // [in] pointer to the argument data
-    void *ArgsData,
-    // [in] size of the argument data
-    size_t ArgsDataSize);
-
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Function parameters for olGetPlatform
 /// @details Each entry is a pointer to the parameter passed to the function;
@@ -1191,6 +1146,8 @@ typedef struct ol_enqueue_memcpy_dto_d_params_t {
 typedef struct ol_enqueue_kernel_launch_params_t {
   ol_queue_handle_t *pQueue;
   ol_kernel_handle_t *pKernel;
+  const void **pArgumentsData;
+  size_t *pArgumentsSize;
   const ol_kernel_launch_size_args_t **pLaunchSizeArgs;
   ol_event_handle_t **pEventOut;
 } ol_enqueue_kernel_launch_params_t;
@@ -1242,25 +1199,6 @@ typedef struct ol_release_kernel_params_t {
   ol_kernel_handle_t *pKernel;
 } ol_release_kernel_params_t;
 
-///////////////////////////////////////////////////////////////////////////////
-/// @brief Function parameters for olSetKernelArgValue
-/// @details Each entry is a pointer to the parameter passed to the function;
-typedef struct ol_set_kernel_arg_value_params_t {
-  ol_kernel_handle_t *pKernel;
-  uint32_t *pIndex;
-  size_t *pSize;
-  void **pArgData;
-} ol_set_kernel_arg_value_params_t;
-
-///////////////////////////////////////////////////////////////////////////////
-/// @brief Function parameters for olSetKernelArgsData
-/// @details Each entry is a pointer to the parameter passed to the function;
-typedef struct ol_set_kernel_args_data_params_t {
-  ol_kernel_handle_t *pKernel;
-  void **pArgsData;
-  size_t *pArgsDataSize;
-} ol_set_kernel_args_data_params_t;
-
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Variant of olInit that also sets source code location information
 /// @details See also ::olInit
@@ -1447,6 +1385,7 @@ OL_APIEXPORT ol_result_t OL_APICALL olEnqueueMemcpyDtoDWithCodeLoc(
 /// @details See also ::olEnqueueKernelLaunch
 OL_APIEXPORT ol_result_t OL_APICALL olEnqueueKernelLaunchWithCodeLoc(
     ol_queue_handle_t Queue, ol_kernel_handle_t Kernel,
+    const void *ArgumentsData, size_t ArgumentsSize,
     const ol_kernel_launch_size_args_t *LaunchSizeArgs,
     ol_event_handle_t *EventOut, ol_code_location_t *CodeLocation);
 
@@ -1494,22 +1433,6 @@ OL_APIEXPORT ol_result_t OL_APICALL olRetainKernelWithCodeLoc(
 OL_APIEXPORT ol_result_t OL_APICALL olReleaseKernelWithCodeLoc(
     ol_kernel_handle_t Kernel, ol_code_location_t *CodeLocation);
 
-///////////////////////////////////////////////////////////////////////////////
-/// @brief Variant of olSetKernelArgValue that also sets source code location
-/// information
-/// @details See also ::olSetKernelArgValue
-OL_APIEXPORT ol_result_t OL_APICALL olSetKernelArgValueWithCodeLoc(
-    ol_kernel_handle_t Kernel, uint32_t Index, size_t Size, void *ArgData,
-    ol_code_location_t *CodeLocation);
-
-///////////////////////////////////////////////////////////////////////////////
-/// @brief Variant of olSetKernelArgsData that also sets source code location
-/// information
-/// @details See also ::olSetKernelArgsData
-OL_APIEXPORT ol_result_t OL_APICALL olSetKernelArgsDataWithCodeLoc(
-    ol_kernel_handle_t Kernel, void *ArgsData, size_t ArgsDataSize,
-    ol_code_location_t *CodeLocation);
-
 #if defined(__cplusplus)
 } // extern "C"
 #endif
diff --git a/offload/liboffload/include/generated/OffloadEntryPoints.inc b/offload/liboffload/include/generated/OffloadEntryPoints.inc
index c3104c2db735e..3fde7898cb7ac 100644
--- a/offload/liboffload/include/generated/OffloadEntryPoints.inc
+++ b/offload/liboffload/include/generated/OffloadEntryPoints.inc
@@ -1077,6 +1077,7 @@ ol_result_t olEnqueueMemcpyDtoDWithCodeLoc(ol_queue_handle_t Queue,
 ///////////////////////////////////////////////////////////////////////////////
 ol_impl_result_t
 olEnqueueKernelLaunch_val(ol_queue_handle_t Queue, ol_kernel_handle_t Kernel,
+                          const void *ArgumentsData, size_t ArgumentsSize,
                           const ol_kernel_launch_size_args_t *LaunchSizeArgs,
                           ol_event_handle_t *EventOut) {
   if (true /*enableParameterValidation*/) {
@@ -1088,26 +1089,33 @@ olEnqueueKernelLaunch_val(ol_queue_handle_t Queue, ol_kernel_handle_t Kernel,
       return OL_ERRC_INVALID_NULL_HANDLE;
     }
 
+    if (NULL == ArgumentsData) {
+      return OL_ERRC_INVALID_NULL_POINTER;
+    }
+
     if (NULL == LaunchSizeArgs) {
       return OL_ERRC_INVALID_NULL_POINTER;
     }
   }
 
-  return olEnqueueKernelLaunch_impl(Queue, Kernel, LaunchSizeArgs, EventOut);
+  return olEnqueueKernelLaunch_impl(Queue, Kernel, ArgumentsData, ArgumentsSize,
+                                    LaunchSizeArgs, EventOut);
 }
 OL_APIEXPORT ol_result_t OL_APICALL
 olEnqueueKernelLaunch(ol_queue_handle_t Queue, ol_kernel_handle_t Kernel,
+                      const void *ArgumentsData, size_t ArgumentsSize,
                       const ol_kernel_launch_size_args_t *LaunchSizeArgs,
                       ol_event_handle_t *EventOut) {
   if (offloadConfig().TracingEnabled) {
     std::cerr << "---> olEnqueueKernelLaunch";
   }
 
-  ol_result_t Result =
-      olEnqueueKernelLaunch_val(Queue, Kernel, LaunchSizeArgs, EventOut);
+  ol_result_t Result = olEnqueueKernelLaunch_val(
+      Queue, Kernel, ArgumentsData, ArgumentsSize, LaunchSizeArgs, EventOut);
 
   if (offloadConfig().TracingEnabled) {
-    ol_enqueue_kernel_launch_params_t Params = {&Queue, &Kernel,
+    ol_enqueue_kernel_launch_params_t Params = {&Queue,          &Kernel,
+                                                &ArgumentsData,  &ArgumentsSize,
                                                 &LaunchSizeArgs, &EventOut};
     std::cerr << "(" << &Params << ")";
     std::cerr << "-> " << Result << "\n";
@@ -1119,11 +1127,12 @@ olEnqueueKernelLaunch(ol_queue_handle_t Queue, ol_kernel_handle_t Kernel,
 }
 ol_result_t olEnqueueKernelLaunchWithCodeLoc(
     ol_queue_handle_t Queue, ol_kernel_handle_t Kernel,
+    const void *ArgumentsData, size_t ArgumentsSize,
     const ol_kernel_launch_size_args_t *LaunchSizeArgs,
     ol_event_handle_t *EventOut, ol_code_location_t *CodeLocation) {
   currentCodeLocation() = CodeLocation;
-  ol_result_t Result =
-      olEnqueueKernelLaunch(Queue, Kernel, LaunchSizeArgs, EventOut);
+  ol_result_t Result = olEnqueueKernelLaunch(
+      Queue, Kernel, ArgumentsData, ArgumentsSize, LaunchSizeArgs, EventOut);
 
   currentCodeLocation() = nullptr;
   return Result;
@@ -1376,93 +1385,3 @@ ol_result_t olReleaseKernelWithCodeLoc(ol_kernel_handle_t Kernel,
   currentCodeLocation() = nullptr;
   return Result;
 }
-
-///////////////////////////////////////////////////////////////////////////////
-ol_impl_result_t olSetKernelArgValue_val(ol_kernel_handle_t Kernel,
-                                         uint32_t Index, size_t Size,
-                                         void *ArgData) {
-  if (true /*enableParameterValidation*/) {
-    if (NULL == Kernel) {
-      return OL_ERRC_INVALID_NULL_HANDLE;
-    }
-
-    if (NULL == ArgData) {
-      return OL_ERRC_INVALID_NULL_POINTER;
-    }
-  }
-
-  return olSetKernelArgValue_impl(Kernel, Index, Size, ArgData);
-}
-OL_APIEXPORT ol_result_t OL_APICALL olSetKernelArgValue(
-    ol_kernel_handle_t Kernel, uint32_t Index, size_t Size, void *ArgData) {
-  if (offloadConfig().TracingEnabled) {
-    std::cerr << "---> olSetKernelArgValue";
-  }
-
-  ol_result_t Result = olSetKernelArgValue_val(Kernel, Index, Size, ArgData);
-
-  if (offloadConfig().TracingEnabled) {
-    ol_set_kernel_arg_value_params_t Params = {&Kernel, &Index, &Size,
-                                               &ArgData};
-    std::cerr << "(" << &Params << ")";
-    std::cerr << "-> " << Result << "\n";
-    if (Result && Result->Details) {
-      std::cerr << "     *Error Details* " << Result->Details << " \n";
-    }
-  }
-  return Result;
-}
-ol_result_t olSetKernelArgValueWithCodeLoc(ol_kernel_handle_t Kernel,
-                                           uint32_t Index, size_t Size,
-                                           void *ArgData,
-                                           ol_code_location_t *CodeLocation) {
-  currentCodeLocation() = CodeLocation;
-  ol_result_t Result = olSetKernelArgValue(Kernel, Index, Size, ArgData);
-
-  currentCodeLocation() = nullptr;
-  return Result;
-}
-
-///////////////////////////////////////////////////////////////////////////////
-ol_impl_result_t olSetKernelArgsData_val(ol_kernel_handle_t Kernel,
-                                         void *ArgsData, size_t ArgsDataSize) {
-  if (true /*enableParameterValidation*/) {
-    if (NULL == Kernel) {
-      return OL_ERRC_INVALID_NULL_HANDLE;
-    }
-
-    if (NULL == ArgsData) {
-      return OL_ERRC_INVALID_NULL_POINTER;
-    }
-  }
-
-  return olSetKernelArgsData_impl(Kernel, ArgsData, ArgsDataSize);
-}
-OL_APIEXPORT ol_result_t OL_APICALL olSetKernelArgsData(
-    ol_kernel_handle_t Kernel, void *ArgsData, size_t ArgsDataSize) {
-  if (offloadConfig().TracingEnabled) {
-    std::cerr << "---> olSetKernelArgsData";
-  }
-
-  ol_result_t Result = olSetKernelArgsData_val(Kernel, ArgsData, ArgsDataSize);
-
-  if (offloadConfig().TracingEnabled) {
-    ol_set_kernel_args_data_params_t Params = {&Kernel, &ArgsData,
-                                               &ArgsDataSize};
-    std::cerr << "(" << &Params << ")";
-    std::cerr << "-> " << Result << "\n";
-    if (Result && Result->Details) {
-      std::cerr << "     *Error Details* " << Result->Details << " \n";
-    }
-  }
-  return Result;
-}
-ol_result_t olSetKernelArgsDataWithCodeLoc(ol_kernel_handle_t Kernel,
-                                           void *ArgsData, size_t ArgsDataSize,
-                                           ol_code_location_t *CodeLocation) {
-  currentCodeLocation() = CodeLocation;
-  ol_result_t Result = olSetKernelArgsData(Kernel, ArgsData, ArgsDataSize);
-
-  currentCodeLocation() = nullptr;
-  return Result;
-}
diff --git a/offload/liboffload/include/generated/OffloadFuncs.inc b/offload/liboffload/include/generated/OffloadFuncs.inc
index 6307d0a54b59e..fa8585437067d 100644
--- a/offload/liboffload/include/generated/OffloadFuncs.inc
+++ b/offload/liboffload/include/generated/OffloadFuncs.inc
@@ -41,8 +41,6 @@ OFFLOAD_FUNC(olReleaseProgram)
 OFFLOAD_FUNC(olCreateKernel)
 OFFLOAD_FUNC(olRetainKernel)
 OFFLOAD_FUNC(olReleaseKernel)
-OFFLOAD_FUNC(olSetKernelArgValue)
-OFFLOAD_FUNC(olSetKernelArgsData)
 OFFLOAD_FUNC(olInitWithCodeLoc)
 OFFLOAD_FUNC(olShutDownWithCodeLoc)
 OFFLOAD_FUNC(olGetPlatformWithCodeLoc)
@@ -74,7 +72,5 @@ OFFLOAD_FUNC(olReleaseProgramWithCodeLoc)
 OFFLOAD_FUNC(olCreateKernelWithCodeLoc)
 OFFLOAD_FUNC(olRetainKernelWithCodeLoc)
 OFFLOAD_FUNC(olReleaseKernelWithCodeLoc)
-OFFLOAD_FUNC(olSetKernelArgValueWithCodeLoc)
-OFFLOAD_FUNC(olSetKernelArgsDataWithCodeLoc)
 
 #undef OFFLOAD_FUNC
diff --git a/offload/liboffload/include/generated/OffloadImplFuncDecls.inc b/offload/liboffload/include/generated/OffloadImplFuncDecls.inc
index 1fb77e3d278c9..8075f5d616efe 100644
--- a/offload/liboffload/include/generated/OffloadImplFuncDecls.inc
+++ b/offload/liboffload/include/generated/OffloadImplFuncDecls.inc
@@ -83,6 +83,7 @@ ol_impl_result_t olEnqueueMemcpyDtoD_impl(ol_queue_handle_t Queue,
 
 ol_impl_result_t
 olEnqueueKernelLaunch_impl(ol_queue_handle_t Queue, ol_kernel_handle_t Kernel,
+                           const void *ArgumentsData, size_t ArgumentsSize,
                            const ol_kernel_launch_size_args_t *LaunchSizeArgs,
                            ol_event_handle_t *EventOut);
 
@@ -101,10 +102,3 @@ ol_impl_result_t olCreateKernel_impl(ol_program_handle_t Program,
 ol_impl_result_t olRetainKernel_impl(ol_kernel_handle_t Kernel);
 
 ol_impl_result_t olReleaseKernel_impl(ol_kernel_handle_t Kernel);
-
-ol_impl_result_t olSetKernelArgValue_impl(ol_kernel_handle_t Kernel,
-                                          uint32_t Index, size_t Size,
-                                          void *ArgData);
-
-ol_impl_result_t olSetKernelArgsData_impl(ol_kernel_handle_t Kernel,
-                                          void *ArgsData, size_t ArgsDataSize);
diff --git a/offload/liboffload/include/generated/OffloadPrint.hpp b/offload/liboffload/include/generated/OffloadPrint.hpp
index a5616f85ea8a3..0821d79829a85 100644
--- a/offload/liboffload/include/generated/OffloadPrint.hpp
+++ b/offload/liboffload/include/generated/OffloadPrint.hpp
@@ -660,6 +660,12 @@ operator<<(std::ostream &os,
   os << ".Kernel = ";
   printPtr(os, *params->pKernel);
   os << ", ";
+  os << ".ArgumentsData = ";
+  printPtr(os, *params->pArgumentsData);
+  os << ", ";
+  os << ".ArgumentsSize = ";
+  os << *params->pArgumentsSize;
+  os << ", ";
   os << ".LaunchSizeArgs = ";
   printPtr(os, *params->pLaunchSizeArgs);
   os << ", ";
@@ -725,37 +731,6 @@ operator<<(std::ostream &os, const struct ol_release_kernel_params_t *params) {
   return os;
 }
 
-inline std::ostream &
-operator<<(std::ostream &os,
-           const struct ol_set_kernel_arg_value_params_t *params) {
-  os << ".Kernel = ";
-  printPtr(os, *params->pKernel);
-  os << ", ";
-  os << ".Index = ";
-  os << *params->pIndex;
-  os << ", ";
-  os << ".Size = ";
-  os << *params->pSize;
-  os << ", ";
-  os << ".ArgData = ";
-  printPtr(os, *params->pArgData);
-  return os;
-}
-
-inline std::ostream &
-operator<<(std::ostream &os,
-           const struct ol_set_kernel_args_data_params_t *params) {
-  os << ".Kernel = ";
-  printPtr(os, *params->pKernel);
-  os << ", ";
-  os << ".ArgsData = ";
-  printPtr(os, *params->pArgsData);
-  os << ", ";
-  os << ".ArgsDataSize = ";
-  os << *params->pArgsDataSize;
-  return os;
-}
-
 ///////////////////////////////////////////////////////////////////////////////
 // @brief Print pointer value
 template <typename T>
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index 63d5bdb1e8f61..2623ce97bc85f 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -51,53 +51,14 @@ struct ol_event_impl_t {
 struct ol_program_impl_t {
   llvm::omp::target::plugin::DeviceImageTy *Image;
   std::unique_ptr<MemoryBuffer> ImageData;
+  __tgt_device_image DeviceImage;
   std::atomic_uint32_t RefCount;
 };
 
-// A helper that can be used to construct the argument buffer for a kernel.
-// Alternatively, a pre-existing buffer can be set with `setArgsData`.
-struct OffloadKernelArguments {
-  static constexpr size_t MaxParamBytes = 4096u;
-  using args_t = std::array<char, MaxParamBytes>;
-  using args_size_t = std::vector<size_t>;
-  using args_ptr_t = std::vector<void *>;
-  args_t Storage;
-  args_size_t ParamSizes;
-  args_ptr_t Pointers;
-
-  // Add an argument. If it already exists, it is replaced. Gaps are filled with
-  // empty arguments. Previous setArgsData calls are invalidated.
-  void addArg(size_t Index, size_t Size, const void *Arg) {
-    if (Index + 1 > Pointers.size()) {
-      Pointers.resize(Index + 1);
-      ParamSizes.resize(Index + 1);
-    }
-    ParamSizes[Index] = Size;
-    // calculate the insertion point on the array
-    size_t InsertPos = std::accumulate(std::begin(ParamSizes),
-                                       std::begin(ParamSizes) + Index, 0);
-    // Update the stored value for the argument
-    std::memcpy(&Storage[InsertPos], Arg, Size);
-    Pointers[Index] = &Storage[InsertPos];
-  }
-
-  // Set all argument data at once. Previous addArg calls are invalidated.
-  void setArgsData(const void *Data, size_t Size) {
-    std::memcpy(Storage.data(), Data, Size);
-    Pointers.clear();
-    ParamSizes.clear();
-  }
-
-  const args_ptr_t &getPointers() const noexcept { return Pointers; }
-
-  const char *getStorage() const noexcept { return Storage.data(); }
-};
-
 struct ol_kernel_impl_t {
   ol_program_handle_t Program;
   std::atomic_uint32_t RefCount;
   GenericKernelTy *KernelImpl;
-  OffloadKernelArguments Args;
 };
 
 using PlatformVecT = SmallVector<ol_platform_impl_t, 4>;
@@ -528,16 +489,20 @@ ol_impl_result_t olCreateProgram_impl(ol_device_handle_t Device, void *ProgData,
   // TODO: Make this copy optional.
   auto ImageData = MemoryBuffer::getMemBufferCopy(
       StringRef(reinterpret_cast<char *>(ProgData), ProgDataSize));
-  __tgt_device_image DeviceImage{
-      const_cast<char *>(ImageData->getBuffer().data()),
-      const_cast<char *>(ImageData->getBuffer().data()) + ProgDataSize - 1,
-      nullptr, nullptr};
 
   ol_program_handle_t Prog = new ol_program_impl_t();
 
-  auto Res = Device->Device->loadBinary(Device->Device->Plugin, &DeviceImage);
-  if (!Res)
+  Prog->DeviceImage = __tgt_device_image{
+      const_cast<char *>(ImageData->getBuffer().data()),
+      const_cast<char *>(ImageData->getBuffer().data()) + ProgDataSize, nullptr,
+      nullptr};
+
+  auto Res =
+      Device->Device->loadBinary(Device->Device->Plugin, &Prog->DeviceImage);
+  if (!Res) {
+    delete Prog;
     return OL_ERRC_INVALID_VALUE;
+  }
 
   Prog->Image = *Res;
   Prog->RefCount = 1;
@@ -593,16 +558,9 @@ ol_impl_result_t olReleaseKernel_impl(ol_kernel_handle_t Kernel) {
   return OL_SUCCESS;
 }
 
-ol_impl_result_t olSetKernelArgValue_impl(ol_kernel_handle_t Kernel,
-                                          uint32_t Index, size_t Size,
-                                          void *ArgData) {
-  Kernel->Args.addArg(Index, Size, ArgData);
-
-  return OL_SUCCESS;
-}
-
 ol_impl_result_t
 olEnqueueKernelLaunch_impl(ol_queue_handle_t Queue, ol_kernel_handle_t Kernel,
+                           const void *ArgumentsData, size_t ArgumentsSize,
                            const ol_kernel_launch_size_args_t *LaunchSizeArgs,
                            ol_event_handle_t *EventOut) {
   auto *DeviceImpl = Queue->Device->Device;
@@ -610,7 +568,6 @@ olEnqueueKernelLaunch_impl(ol_queue_handle_t Queue, ol_kernel_handle_t Kernel,
   AsyncInfoWrapperTy AsyncInfoWrapper(*DeviceImpl, Queue->AsyncInfo);
 
   KernelArgsTy LaunchArgs{};
-  LaunchArgs.NumArgs = Kernel->Args.getPointers().size();
   LaunchArgs.NumTeams[0] = LaunchSizeArgs->NumGroupsX;
   LaunchArgs.NumTeams[1] = LaunchSizeArgs->NumGroupsY;
   LaunchArgs.NumTeams[2] = LaunchSizeArgs->NumGroupsZ;
@@ -618,15 +575,15 @@ olEnqueueKernelLaunch_impl(ol_queue_handle_t Queue, ol_kernel_handle_t Kernel,
   LaunchArgs.ThreadLimit[1] = LaunchSizeArgs->GroupSizeY;
   LaunchArgs.ThreadLimit[2] = LaunchSizeArgs->GroupSizeZ;
 
-  LaunchArgs.ArgPtrs =
-      reinterpret_cast<void **>(const_cast<char *>(Kernel->Args.getStorage()));
-
-  // No offsets needed, arguments are real pointers
-  auto ArgOffsets = std::vector<ptrdiff_t>(LaunchArgs.NumArgs, 0ul);
+  KernelLaunchParamsTy Params;
+  Params.Data = const_cast<void *>(ArgumentsData);
+  Params.Size = ArgumentsSize;
+  LaunchArgs.ArgPtrs = reinterpret_cast<void **>(&Params);
+  // Don't do anything with pointer indirection; use arg data as-is
+  LaunchArgs.Flags.IsCUDA = true;
 
   auto Err = Kernel->KernelImpl->launch(*DeviceImpl, LaunchArgs.ArgPtrs,
-                                        ArgOffsets.data(), LaunchArgs,
-                                        AsyncInfoWrapper);
+                                        nullptr, LaunchArgs, AsyncInfoWrapper);
 
   AsyncInfoWrapper.finalize(Err);
   if (Err)
@@ -637,9 +594,3 @@ olEnqueueKernelLaunch_impl(ol_queue_handle_t Queue, ol_kernel_handle_t Kernel,
 
   return OL_SUCCESS;
 }
-
-ol_impl_result_t olSetKernelArgsData_impl(ol_kernel_handle_t Kernel,
-                                          void *ArgsData, size_t ArgsDataSize) {
-  Kernel->Args.setArgsData(ArgsData, ArgsDataSize);
-  return OL_SUCCESS;
-}



More information about the llvm-commits mailing list