[llvm-branch-commits] [llvm] [libsycl] add single_task (PR #188797)

Kseniya Tikhomirova via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Wed Apr 1 07:00:20 PDT 2026


https://github.com/KseniyaTikhomirova updated https://github.com/llvm/llvm-project/pull/188797

>From 03a1c675484bf83746ac9cb9b9580e2f3bed238f Mon Sep 17 00:00:00 2001
From: "Tikhomirova, Kseniya" <kseniya.tikhomirova at intel.com>
Date: Wed, 25 Mar 2026 05:18:49 -0700
Subject: [PATCH 1/2] [libsycl] add single_task

Signed-off-by: Tikhomirova, Kseniya <kseniya.tikhomirova at intel.com>

addition to single task

Signed-off-by: Tikhomirova, Kseniya <kseniya.tikhomirova at intel.com>
---
 libsycl/docs/index.rst                        |   4 +
 .../sycl/__impl/detail/arg_wrapper.hpp        | 135 ++++++++++++++++++
 .../sycl/__impl/detail/unified_range_view.hpp |  52 +++++++
 libsycl/include/sycl/__impl/queue.hpp         |  96 +++++++++++++
 libsycl/src/detail/queue_impl.cpp             | 112 +++++++++++++++
 libsycl/src/detail/queue_impl.hpp             |  37 +++++
 libsycl/src/queue.cpp                         |  19 +++
 libsycl/test/basic/get_backend.cpp            |  54 +++++++
 libsycl/test/basic/submit_fn_ptr.cpp          |  18 +++
 9 files changed, 527 insertions(+)
 create mode 100644 libsycl/include/sycl/__impl/detail/arg_wrapper.hpp
 create mode 100644 libsycl/include/sycl/__impl/detail/unified_range_view.hpp
 create mode 100644 libsycl/test/basic/get_backend.cpp
 create mode 100644 libsycl/test/basic/submit_fn_ptr.cpp

diff --git a/libsycl/docs/index.rst b/libsycl/docs/index.rst
index 9aa36b4a54c57..5961eeeedcedb 100644
--- a/libsycl/docs/index.rst
+++ b/libsycl/docs/index.rst
@@ -113,6 +113,10 @@ TODO for added SYCL classes
   * to implement submit & copy with accessors (low priority)
   * get_info & properties
   * ctors that accepts context (blocked by lack of liboffload support)
+  * nd_range kernel submissions
+  * cross-context events wait (host tasks are needed)
+  * implement check if lambda arguments are device copyable (requires clang support of corresponding builtins)
+  * kernel instantiating on host (debugging purposes)
 
 * ``property_list``: to fully implement and integrate with existing SYCL runtime classes supporting it
 * usm allocations:
diff --git a/libsycl/include/sycl/__impl/detail/arg_wrapper.hpp b/libsycl/include/sycl/__impl/detail/arg_wrapper.hpp
new file mode 100644
index 0000000000000..96f60a3121787
--- /dev/null
+++ b/libsycl/include/sycl/__impl/detail/arg_wrapper.hpp
@@ -0,0 +1,135 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains helper functions used to wrap kernel arguments to
+/// typeless collection.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBSYCL___IMPL_DETAIL_ARG_WRAPPER_HPP
+#define _LIBSYCL___IMPL_DETAIL_ARG_WRAPPER_HPP
+
+#include <sycl/__impl/detail/config.hpp>
+#include <sycl/__impl/exception.hpp>
+
+#include <cassert>
+#include <memory>
+
+_LIBSYCL_BEGIN_NAMESPACE_SYCL
+
+namespace detail {
+
+/// Base class is needed for unification, we pass arguments through ABI
+/// boundary.
+class ArgWrapperBase {
+public:
+  ArgWrapperBase(const ArgWrapperBase &) = delete;
+  ArgWrapperBase &operator=(const ArgWrapperBase &) = delete;
+  virtual ~ArgWrapperBase() = default;
+
+  virtual void deepCopy() = 0;
+  virtual size_t getSize() const = 0;
+  virtual const void *getPtr() const = 0;
+
+protected:
+  ArgWrapperBase() = default;
+};
+
+/// Helps to manage arguments in a typeless way.
+template <typename Type> class ArgWrapper : public ArgWrapperBase {
+public:
+  ArgWrapper(Type &Arg) { Ptr = &Arg; }
+  ArgWrapper(const ArgWrapper &) = delete;
+  ArgWrapper &operator=(const ArgWrapper &) = delete;
+
+  /// \return size of argument in bytes.
+  size_t getSize() const override { return sizeof(Type); }
+
+  /// Returns raw pointer to the corresponding argument.
+  /// No copy is done by this method. It works with pointer to the memory whose
+  /// existence must be guaranteed by class user or with copy that must be
+  /// explicitly requested by class user via deepCopy method.
+  /// \return pointer to the argument.
+  const void *getPtr() const override {
+    assert((!DeepCopy || (DeepCopy.get()) == Ptr) &&
+           "Incorrect state of copied argument");
+    return Ptr;
+  }
+
+  /// Copies agrument to RT owned storage.
+  void deepCopy() override {
+    if (DeepCopy)
+      return;
+
+    DeepCopy.reset(new Type(*Ptr));
+    Ptr = DeepCopy.get();
+  }
+
+private:
+  Type *Ptr;
+  std::unique_ptr<Type> DeepCopy;
+};
+
+/// Collection of arguments. Provides functionality to accumulate all arguments
+/// data to pass through ABI boundary.
+class ArgCollection {
+public:
+  /// Adds argument to the collection. Don't own the memory. Argument lifetime
+  /// must be guaranteed by class user. If extended lifetime is needed (copy),
+  /// deepCopy must be called.
+  template <typename Type> void addArg(Type &Arg) {
+    MArgs.emplace_back(new ArgWrapper(Arg));
+  }
+
+  /// \return array of argument pointers.
+  const void **getArgPtrArray() {
+    if (MPtrs.size() != MArgs.size()) {
+      MPtrs.clear();
+      MPtrs.reserve(MArgs.size());
+      auto it = MArgs.cbegin();
+      while (it != MArgs.cend()) {
+        MPtrs.push_back((*it++)->getPtr());
+      }
+    }
+    return MPtrs.data();
+  }
+
+  /// \return array of argument sizes.
+  int64_t *getSizesArray() {
+    if (MSizes.size() != MArgs.size()) {
+      MSizes.clear();
+      MSizes.reserve(MArgs.size());
+      auto it = MArgs.cbegin();
+      while (it != MArgs.cend()) {
+        MSizes.push_back(static_cast<int64_t>((*it++)->getSize()));
+      }
+    }
+    return MSizes.data();
+  }
+
+  /// \return count of arguments in collection.
+  size_t getArgCount() { return MArgs.size(); }
+
+  /// Extends arguments lifetime by doing copy of all arguments.
+  void deepCopy() {
+    for (auto &Arg : MArgs)
+      Arg->deepCopy();
+  }
+
+private:
+  std::vector<std::unique_ptr<ArgWrapperBase>> MArgs;
+  std::vector<int64_t> MSizes;
+  std::vector<const void *> MPtrs;
+};
+
+} // namespace detail
+
+_LIBSYCL_END_NAMESPACE_SYCL
+
+#endif // _LIBSYCL___IMPL_DETAIL_ARG_WRAPPER_HPP
diff --git a/libsycl/include/sycl/__impl/detail/unified_range_view.hpp b/libsycl/include/sycl/__impl/detail/unified_range_view.hpp
new file mode 100644
index 0000000000000..afa613fc8627b
--- /dev/null
+++ b/libsycl/include/sycl/__impl/detail/unified_range_view.hpp
@@ -0,0 +1,52 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains helper function class to unify ABI for different kernel
+/// ranges.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBSYCL___IMPL_DETAIL_UNIFIED_RANGE_VIEW_HPP
+#define _LIBSYCL___IMPL_DETAIL_UNIFIED_RANGE_VIEW_HPP
+
+#include <sycl/__impl/detail/config.hpp>
+
+_LIBSYCL_BEGIN_NAMESPACE_SYCL
+
+namespace detail {
+
+/// The structure to keep dimension and references to ranges unified for
+/// all dimensions.
+class UnifiedRangeView {
+
+public:
+  /// Default contructed view matches single task execution range.
+  UnifiedRangeView() = default;
+  UnifiedRangeView(const UnifiedRangeView &Desc) = default;
+  UnifiedRangeView(UnifiedRangeView &&Desc) = default;
+  UnifiedRangeView &operator=(const UnifiedRangeView &Desc) = default;
+  UnifiedRangeView &operator=(UnifiedRangeView &&Desc) = default;
+
+  // TODO: ctors with sycl::range and nd::range will be added later.
+
+  UnifiedRangeView(const size_t *GlobalSize, const size_t *LocalSize,
+                   const size_t *Offset, size_t Dims)
+      : MGlobalSize(GlobalSize), MLocalSize(LocalSize), MOffset(Offset),
+        MDims(Dims) {}
+
+  const size_t *MGlobalSize = nullptr;
+  const size_t *MLocalSize = nullptr;
+  const size_t *MOffset = nullptr;
+  size_t MDims = 1;
+};
+} // namespace detail
+
+_LIBSYCL_END_NAMESPACE_SYCL
+
+#endif // _LIBSYCL___IMPL_DETAIL_UNIFIED_RANGE_VIEW_HPP
diff --git a/libsycl/include/sycl/__impl/queue.hpp b/libsycl/include/sycl/__impl/queue.hpp
index 587f56a8eb245..d1ac320433c38 100644
--- a/libsycl/include/sycl/__impl/queue.hpp
+++ b/libsycl/include/sycl/__impl/queue.hpp
@@ -20,9 +20,11 @@
 #include <sycl/__impl/event.hpp>
 #include <sycl/__impl/property_list.hpp>
 
+#include <sycl/__impl/detail/arg_wrapper.hpp>
 #include <sycl/__impl/detail/config.hpp>
 #include <sycl/__impl/detail/default_async_handler.hpp>
 #include <sycl/__impl/detail/obj_utils.hpp>
+#include <sycl/__impl/detail/unified_range_view.hpp>
 
 _LIBSYCL_BEGIN_NAMESPACE_SYCL
 
@@ -31,6 +33,27 @@ class context;
 namespace detail {
 class QueueImpl;
 
+template <typename, typename T> struct CheckFunctionSignature {
+  static_assert(std::integral_constant<T, false>::value,
+                "Second template parameter is required to be of function type");
+};
+
+template <typename F, typename RetT, typename... Args>
+struct CheckFunctionSignature<F, RetT(Args...)> {
+private:
+  template <typename T>
+  static constexpr auto check(T *) -> typename std::is_same<
+      decltype(std::declval<T>().operator()(std::declval<Args>()...)),
+      RetT>::type;
+
+  template <typename> static constexpr std::false_type check(...);
+
+  using type = decltype(check<F>(0));
+
+public:
+  static constexpr bool value = type::value;
+};
+
 } // namespace detail
 
 // SYCL 2020 4.6.5. Queue class.
@@ -138,12 +161,85 @@ class _LIBSYCL_EXPORT queue {
   template <typename Param>
   typename Param::return_type get_backend_info() const;
 
+  /// Defines and invokes a SYCL kernel function as a lambda expression or a
+  /// named function object type.
+  ///
+  /// \param kernelFunc is the kernel functor or lambda.
+  /// \return an event that represents the status of the submitted kernel.
+  template <typename KernelName, typename KernelType>
+  event single_task(const KernelType &kernelFunc) {
+    return single_task<KernelName, KernelType>({}, kernelFunc);
+  }
+
+  /// Defines and invokes a SYCL kernel function as a lambda expression or a
+  /// named function object type.
+  ///
+  /// \param depEvent is an event that specifies the kernel dependency.
+  /// \param kernelFunc is the kernel functor or lambda.
+  /// \return an event that represents the status of the submitted kernel.
+  template <typename KernelName, typename KernelType>
+  event single_task(event depEvent, const KernelType &kernelFunc) {
+    return single_task<KernelName, KernelType>({depEvent}, kernelFunc);
+  }
+
+  /// Defines and invokes a SYCL kernel function as a lambda expression or a
+  /// named function object type.
+  ///
+  /// \param depEvents is a collection of events which specify the kernel
+  /// dependencies.
+  /// \param kernelFunc is the kernel functor or lambda.
+  /// \return an event that represents the status of the submitted kernel.
+  template <typename KernelName, typename KernelType>
+  event single_task(const std::vector<event> &depEvents,
+                    const KernelType &kernelFunc) {
+    static_assert(
+        (detail::CheckFunctionSignature<std::remove_reference_t<KernelType>,
+                                        void()>::value),
+        "sycl::queue::single_task() requires a kernel instead of command "
+        "group. ");
+
+    setKernelParameters(depEvents);
+    submitSingleTask<KernelName, KernelType>(kernelFunc);
+    return getLastEvent();
+  }
+
   /// Blocks the calling thread until all commands previously submitted to this
   /// queue have completed. Synchronous errors are reported through SYCL
   /// exceptions.
   void wait();
 
 private:
+  // Name of this function is defined by compiler. It generates call to this
+  // function in the host implementation of KernelFunc in submitSingleTask.
+  template <typename, typename... Args>
+  void sycl_kernel_launch(const char *KernelName, Args &&...args) {
+    static_assert((sizeof...(args) == 1) &&
+                  "Only 2 arguments are expected in sycl_kernel_launch.");
+    detail::ArgCollection TypelessArgs;
+    (TypelessArgs.addArg(args), ...);
+
+    submitKernelImpl(KernelName, TypelessArgs);
+  }
+
+#ifdef SYCL_LANGUAGE_VERSION
+#  define _LIBSYCL_ENTRY_POINT_ATTR__(KernelName)                              \
+    [[clang::sycl_kernel_entry_point(KernelName)]]
+#else
+#  define _LIBSYCL_ENTRY_POINT_ATTR__(KernelName)
+#endif // SYCL_LANGUAGE_VERSION
+
+  template <typename KernelName, typename KernelType>
+  _LIBSYCL_ENTRY_POINT_ATTR__(KernelName)
+  void submitSingleTask(const KernelType KernelFunc) {
+    KernelFunc();
+  }
+
+  event getLastEvent();
+  void submitKernelImpl(const char *KernelName,
+                        detail::ArgCollection &TypelessArgs);
+  void setKernelParameters(const std::vector<event> &Events,
+                           const detail::UnifiedRangeView &Range = {});
+
   queue(const std::shared_ptr<detail::QueueImpl> &Impl) : impl(Impl) {}
   std::shared_ptr<detail::QueueImpl> impl;
 
diff --git a/libsycl/src/detail/queue_impl.cpp b/libsycl/src/detail/queue_impl.cpp
index 74ccc48877c09..243f38612e74c 100644
--- a/libsycl/src/detail/queue_impl.cpp
+++ b/libsycl/src/detail/queue_impl.cpp
@@ -16,6 +16,32 @@ _LIBSYCL_BEGIN_NAMESPACE_SYCL
 
 namespace detail {
 
+static void setKernelLaunchArgs(const detail::UnifiedRangeView &Range,
+                                ol_kernel_launch_size_args_t &ArgsToSet) {
+  size_t GlobalSize[3] = {1, 1, 1};
+  if (Range.MGlobalSize) {
+    for (uint32_t I = 0; I < Range.MDims; I++) {
+      GlobalSize[I] = Range.MGlobalSize[I];
+    }
+  }
+
+  size_t GroupSize[3] = {1, 1, 1};
+  if (Range.MLocalSize) {
+    for (uint32_t I = 0; I < Range.MDims; I++) {
+      GroupSize[I] = Range.MLocalSize[I];
+    }
+  }
+
+  ArgsToSet.Dimensions = Range.MDims;
+  ArgsToSet.NumGroups.x = GlobalSize[0] / GroupSize[0];
+  ArgsToSet.NumGroups.y = GlobalSize[1] / GroupSize[1];
+  ArgsToSet.NumGroups.z = GlobalSize[2] / GroupSize[2];
+  ArgsToSet.GroupSize.x = GroupSize[0];
+  ArgsToSet.GroupSize.y = GroupSize[1];
+  ArgsToSet.GroupSize.z = GroupSize[2];
+  ArgsToSet.DynSharedMemory = 0;
+}
+
 QueueImpl::QueueImpl(DeviceImpl &deviceImpl, const async_handler &asyncHandler,
                      const property_list &propList, PrivateTag)
     : MIsInorder(false), MAsyncHandler(asyncHandler), MPropList(propList),
@@ -34,5 +60,91 @@ backend QueueImpl::getBackend() const noexcept { return MDevice.getBackend(); }
 
 void QueueImpl::wait() { callAndThrow(olSyncQueue, MOffloadQueue); }
 
+static bool checkEventsPlatformMatch(std::vector<EventImplPtr> &Events,
+                                     const PlatformImpl &QueuePlatform) {
+  // liboffload limitation to olWaitEvents. We can't do any extra handling for
+  // cross context/platform events without host task support now.
+  //   "The input events can be from any queue on any device provided by the
+  //   same platform as `Queue`."
+  return std::all_of(Events.cbegin(), Events.cend(),
+                     [&QueuePlatform](const EventImplPtr &Event) {
+                       return &Event->getPlatformImpl() == &QueuePlatform;
+                     });
+}
+
+void QueueImpl::setKernelParameters(std::vector<EventImplPtr> &&Events,
+                                    const detail::UnifiedRangeView &Range) {
+  if (!checkEventsPlatformMatch(Events, MDevice.getPlatformImpl()))
+    throw sycl::exception(
+        sycl::make_error_code(sycl::errc::feature_not_supported),
+        "libsycl doesn't support cross-context/platform event dependencies "
+        "now.");
+
+  // TODO: this convertion and storing only offload events is possible only
+  // while we don't have host tasks (and features based on host tasks, like
+  // streams). With them - it is very likely we should copy EventImplPtr
+  // (shared_ptr) and keep it here. Although it may differ if host tasks will be
+  // implemented on offload level (no data now).
+  assert(MCurrentSubmitInfo.DepEvents.empty() &&
+         "Kernel submission must clean up dependencies.");
+  MCurrentSubmitInfo.DepEvents.reserve(Events.size());
+  for (auto &Event : Events) {
+    assert(Event && "Event impl object can't be nullptr");
+    MCurrentSubmitInfo.DepEvents.push_back(Event->getHandle());
+  }
+  setKernelLaunchArgs(Range, MCurrentSubmitInfo.Range);
+}
+
+void QueueImpl::submitKernelImpl(const char *KernelName,
+                                 detail::ArgCollection &TypelessArgs) {
+  ol_symbol_handle_t Kernel =
+      detail::ProgramManager::getInstance().getOrCreateKernel(KernelName,
+                                                              MDevice);
+  assert(Kernel);
+
+  ol_event_handle_t NewEvent{};
+  if (!MCurrentSubmitInfo.DepEvents.empty()) {
+    callAndThrow(olWaitEvents, MOffloadQueue,
+                 MCurrentSubmitInfo.DepEvents.data(),
+                 MCurrentSubmitInfo.DepEvents.size());
+  }
+
+  const void *Arguments = nullptr;
+  int64_t ArgumentsSize = 0;
+  if (TypelessArgs.getArgCount()) {
+    // without decomposition and free functions extension we always expect 1
+    // argument to the kernel - lambda capture.
+    assert(TypelessArgs.getArgCount() == 1 &&
+           "No arg decomposition or extensions are supported now.");
+    // TODO: liboffload doesn't support more than 1 argument without copy now.
+    // It doesn't expect array of arguments, it requires a contiguous memory
+    // with args. While we have only 1 argument we don't need extra handling
+    // here, we just pass the first argument directly.
+    Arguments = TypelessArgs.getArgPtrArray()[0];
+    ArgumentsSize = TypelessArgs.getSizesArray()[0];
+  }
+
+  // ol_kernel_launch_prop_t Props[2];
+  // Props[0].type = OL_KERNEL_LAUNCH_PROP_TYPE_SIZE;
+  // Props[0].data = &ArgumentsSize;
+  // Props[1] = OL_KERNEL_LAUNCH_PROP_END;
+  auto Result =
+      olLaunchKernel(MOffloadQueue, MDevice.getHandle(), Kernel, Arguments,
+                     ArgumentsSize, &MCurrentSubmitInfo.Range /*, Props*/);
+  // Clean up current kernel submit data to prepare structures for next
+  // submission.
+  MCurrentSubmitInfo.DepEvents.clear();
+  MCurrentSubmitInfo.Range = {};
+  if (isFailed(Result))
+    throw sycl::exception(sycl::make_error_code(sycl::errc::runtime),
+                          std::string("Kernel submission (") + KernelName +
+                              ") failed with " + formatCodeString(Result));
+
+  callAndThrow(olCreateEvent, MOffloadQueue, &NewEvent);
+
+  MCurrentSubmitInfo.LastEvent =
+      EventImpl::createEventWithHandle(NewEvent, MDevice.getPlatformImpl());
+}
+
 } // namespace detail
 _LIBSYCL_END_NAMESPACE_SYCL
diff --git a/libsycl/src/detail/queue_impl.hpp b/libsycl/src/detail/queue_impl.hpp
index cdb7595e852ec..6edb40471826a 100644
--- a/libsycl/src/detail/queue_impl.hpp
+++ b/libsycl/src/detail/queue_impl.hpp
@@ -15,6 +15,7 @@
 #include <OffloadAPI.h>
 
 #include <memory>
+#include <mutex>
 
 _LIBSYCL_BEGIN_NAMESPACE_SYCL
 namespace detail {
@@ -62,16 +63,52 @@ class QueueImpl : public std::enable_shared_from_this<QueueImpl> {
   /// \return true if and only if the queue is in order.
   bool isInOrder() const { return MIsInorder; }
 
+  /// Enqueues kernel to liboffload.
+  /// Kernel parameters like dependencies and range must be passed in advance by
+  /// calling setKernelParameters.
+  /// \param KernelName a name of kernel to be enqueued.
+  /// \param TypelessArgs data about kernel arguments to be used for enqueue.
+  void submitKernelImpl(const char *KernelName,
+                        detail::ArgCollection &TypelessArgs);
+
+  /// \return an event impl object that corresponds to the last kernel
+  /// submission in the calling thread.
+  EventImplPtr getLastEvent() {
+    assert(MCurrentSubmitInfo.LastEvent &&
+           "getLastEvent must be called after enqueue");
+    return MCurrentSubmitInfo.LastEvent;
+  }
+
+  /// Sets kernel parameters to be used in the next submitKernelImpl call.
+  /// Must be called prior to submitKernelImpl call.
+  /// \param Events a collection of events that kernal depends on.
+  /// \param Range a unified range view of execution range.
+  void setKernelParameters(std::vector<EventImplPtr> &&Events,
+                           const detail::UnifiedRangeView &Range);
+
   /// Waits for completion of all kernels submitted to this queue.
   void wait();
 
 private:
+  // Queue features.
   ol_queue_handle_t MOffloadQueue = {};
   const bool MIsInorder;
   const async_handler MAsyncHandler;
   const property_list MPropList;
   DeviceImpl &MDevice;
   ContextImpl &MContext;
+
+  // Submit data.
+  struct KernelSubmitInfo {
+    EventImplPtr LastEvent;
+    ol_kernel_launch_size_args_t Range;
+    // TODO: consider storing EventImplPtr here, it will work with plain handle
+    // only because submission is done within queue::submit call. Otherwise we
+    // need to ensure that event handle is still alive by keeping our own copy
+    // of EventImpl.
+    std::vector<ol_event_handle_t> DepEvents;
+  };
+  inline static thread_local KernelSubmitInfo MCurrentSubmitInfo = {};
 };
 
 } // namespace detail
diff --git a/libsycl/src/queue.cpp b/libsycl/src/queue.cpp
index 9fe020eabf2cc..f9d867e9567d7 100644
--- a/libsycl/src/queue.cpp
+++ b/libsycl/src/queue.cpp
@@ -33,6 +33,25 @@ device queue::get_device() const {
 
 bool queue::is_in_order() const { return impl->isInOrder(); }
 
+event queue::getLastEvent() {
+  return detail::createSyclObjFromImpl<event>(impl->getLastEvent());
+}
+
+void queue::setKernelParameters(const std::vector<event> &Events,
+                                const detail::UnifiedRangeView &Range) {
+  std::vector<detail::EventImplPtr> DepEventImplRefs;
+  DepEventImplRefs.reserve(Events.size());
+  for (const auto &Event : Events) {
+    DepEventImplRefs.push_back(detail::getSyclObjImpl(Event));
+  }
+  return impl->setKernelParameters(std::move(DepEventImplRefs), Range);
+}
+
+void queue::submitKernelImpl(const char *KernelName,
+                             detail::ArgCollection &TypelessArgs) {
+  impl->submitKernelImpl(KernelName, TypelessArgs);
+}
+
 void queue::wait() { return impl->wait(); }
 
 _LIBSYCL_END_NAMESPACE_SYCL
diff --git a/libsycl/test/basic/get_backend.cpp b/libsycl/test/basic/get_backend.cpp
new file mode 100644
index 0000000000000..064149a0c67e8
--- /dev/null
+++ b/libsycl/test/basic/get_backend.cpp
@@ -0,0 +1,54 @@
+// REQUIRES: any-device
+// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: %t.out
+
+#include <iostream>
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+class Kernel1;
+
+bool check(backend be) {
+  switch (be) {
+  case backend::opencl:
+  case backend::level_zero:
+  case backend::cuda:
+  case backend::hip:
+    return true;
+  default:
+    return false;
+  }
+  return false;
+}
+
+inline void return_fail() {
+  std::cout << "Failed" << std::endl;
+  exit(1);
+}
+
+int main() {
+  for (const auto &plt : platform::get_platforms()) {
+    if (check(plt.get_backend()) == false) {
+      return_fail();
+    }
+
+    auto device = device::get_devices()[0];
+    if (device.get_backend() != plt.get_backend()) {
+      return_fail();
+    }
+
+    queue q(device);
+    if (q.get_backend() != plt.get_backend()) {
+      return_fail();
+    }
+
+    event e = q.single_task<Kernel1>([]() {});
+    if (e.get_backend() != plt.get_backend()) {
+      return_fail();
+    }
+  }
+  std::cout << "Passed" << std::endl;
+  return 0;
+}
diff --git a/libsycl/test/basic/submit_fn_ptr.cpp b/libsycl/test/basic/submit_fn_ptr.cpp
new file mode 100644
index 0000000000000..2a5ce832d4db2
--- /dev/null
+++ b/libsycl/test/basic/submit_fn_ptr.cpp
@@ -0,0 +1,18 @@
+// REQUIRES: any-device
+// RUN: %clangxx -fsycl  %s -o %t.out
+// RUN: %t.out
+
+#include <sycl/sycl.hpp>
+
+class Test;
+
+int main() {
+  sycl::queue q;
+  int *p = sycl::malloc_shared<int>(1, q);
+  *p = 0;
+  q.single_task<Test>([=]() { *p = 42; });
+  q.wait();
+  assert(*p == 42);
+  sycl::free(p, q);
+  return 0;
+}

>From 59c746a55b77bff12ec5d5f80b2fe3373e80fcfd Mon Sep 17 00:00:00 2001
From: "Tikhomirova, Kseniya" <kseniya.tikhomirova at intel.com>
Date: Wed, 1 Apr 2026 06:59:55 -0700
Subject: [PATCH 2/2] fix comments

Signed-off-by: Tikhomirova, Kseniya <kseniya.tikhomirova at intel.com>
---
 .../sycl/__impl/detail/arg_wrapper.hpp        | 43 +++++++++++--------
 .../sycl/__impl/detail/unified_range_view.hpp |  2 +-
 libsycl/include/sycl/__impl/queue.hpp         | 16 ++++---
 libsycl/src/detail/program_manager.cpp        |  9 ++--
 libsycl/src/detail/program_manager.hpp        |  5 ++-
 libsycl/src/detail/queue_impl.cpp             | 24 +++++++----
 libsycl/src/detail/queue_impl.hpp             | 11 +++--
 libsycl/src/queue.cpp                         |  2 +-
 libsycl/test/basic/get_backend.cpp            |  5 +--
 libsycl/test/basic/submit_fn_ptr.cpp          |  6 ++-
 10 files changed, 70 insertions(+), 53 deletions(-)

diff --git a/libsycl/include/sycl/__impl/detail/arg_wrapper.hpp b/libsycl/include/sycl/__impl/detail/arg_wrapper.hpp
index 96f60a3121787..ba279f3ab4d1c 100644
--- a/libsycl/include/sycl/__impl/detail/arg_wrapper.hpp
+++ b/libsycl/include/sycl/__impl/detail/arg_wrapper.hpp
@@ -33,7 +33,7 @@ class ArgWrapperBase {
   ArgWrapperBase &operator=(const ArgWrapperBase &) = delete;
   virtual ~ArgWrapperBase() = default;
 
-  virtual void deepCopy() = 0;
+  virtual bool deepCopy() = 0;
   virtual size_t getSize() const = 0;
   virtual const void *getPtr() const = 0;
 
@@ -48,27 +48,29 @@ template <typename Type> class ArgWrapper : public ArgWrapperBase {
   ArgWrapper(const ArgWrapper &) = delete;
   ArgWrapper &operator=(const ArgWrapper &) = delete;
 
-  /// \return size of argument in bytes.
+  /// \return the size of the argument in bytes.
   size_t getSize() const override { return sizeof(Type); }
 
-  /// Returns raw pointer to the corresponding argument.
+  /// Returns a raw pointer to the corresponding argument.
   /// No copy is done by this method. It works with pointer to the memory whose
   /// existence must be guaranteed by class user or with copy that must be
   /// explicitly requested by class user via deepCopy method.
-  /// \return pointer to the argument.
+  /// \return a pointer to the argument.
   const void *getPtr() const override {
     assert((!DeepCopy || (DeepCopy.get()) == Ptr) &&
            "Incorrect state of copied argument");
     return Ptr;
   }
 
-  /// Copies agrument to RT owned storage.
-  void deepCopy() override {
+  /// Copies the agrument to RT owned storage.
+  /// \return true if argument was copied in this exact call.
+  bool deepCopy() override {
     if (DeepCopy)
-      return;
+      return false;
 
     DeepCopy.reset(new Type(*Ptr));
     Ptr = DeepCopy.get();
+    return true;
   }
 
 private:
@@ -80,9 +82,9 @@ template <typename Type> class ArgWrapper : public ArgWrapperBase {
 /// data to pass through ABI boundary.
 class ArgCollection {
 public:
-  /// Adds argument to the collection. Don't own the memory. Argument lifetime
-  /// must be guaranteed by class user. If extended lifetime is needed (copy),
-  /// deepCopy must be called.
+  /// Adds an argument to the collection. Doesn't own the memory, the argument
+  /// lifetime must be guaranteed by the class user. If extended lifetime is
+  /// needed (copy), deepCopy must be called.
   template <typename Type> void addArg(Type &Arg) {
     MArgs.emplace_back(new ArgWrapper(Arg));
   }
@@ -92,10 +94,8 @@ class ArgCollection {
     if (MPtrs.size() != MArgs.size()) {
       MPtrs.clear();
       MPtrs.reserve(MArgs.size());
-      auto it = MArgs.cbegin();
-      while (it != MArgs.cend()) {
-        MPtrs.push_back((*it++)->getPtr());
-      }
+      for (const auto &Argument : MArgs)
+        MPtrs.push_back(Argument->getPtr());
     }
     return MPtrs.data();
   }
@@ -105,10 +105,8 @@ class ArgCollection {
     if (MSizes.size() != MArgs.size()) {
       MSizes.clear();
       MSizes.reserve(MArgs.size());
-      auto it = MArgs.cbegin();
-      while (it != MArgs.cend()) {
-        MSizes.push_back(static_cast<int64_t>((*it++)->getSize()));
-      }
+      for (const auto &Argument : MArgs)
+        MSizes.push_back(static_cast<int64_t>(Argument->getSize()));
     }
     return MSizes.data();
   }
@@ -118,8 +116,15 @@ class ArgCollection {
 
   /// Extends arguments lifetime by doing copy of all arguments.
   void deepCopy() {
+    bool CopiedAtLeastOne = false;
     for (auto &Arg : MArgs)
-      Arg->deepCopy();
+      CopiedAtLeastOne |= Arg->deepCopy();
+
+    if (CopiedAtLeastOne) {
+      MPtrs.clear();
+      // MSizes must be the same. No changes here so no need to clean and
+      // refill.
+    }
   }
 
 private:
diff --git a/libsycl/include/sycl/__impl/detail/unified_range_view.hpp b/libsycl/include/sycl/__impl/detail/unified_range_view.hpp
index afa613fc8627b..8eb256b7e6b0a 100644
--- a/libsycl/include/sycl/__impl/detail/unified_range_view.hpp
+++ b/libsycl/include/sycl/__impl/detail/unified_range_view.hpp
@@ -26,7 +26,7 @@ namespace detail {
 class UnifiedRangeView {
 
 public:
-  /// Default contructed view matches single task execution range.
+  /// Default contructed view matches the single task execution range.
   UnifiedRangeView() = default;
   UnifiedRangeView(const UnifiedRangeView &Desc) = default;
   UnifiedRangeView(UnifiedRangeView &&Desc) = default;
diff --git a/libsycl/include/sycl/__impl/queue.hpp b/libsycl/include/sycl/__impl/queue.hpp
index d1ac320433c38..a97df7a6260d2 100644
--- a/libsycl/include/sycl/__impl/queue.hpp
+++ b/libsycl/include/sycl/__impl/queue.hpp
@@ -185,7 +185,7 @@ class _LIBSYCL_EXPORT queue {
   /// Defines and invokes a SYCL kernel function as a lambda expression or a
   /// named function object type.
   ///
-  /// \param depEvents is a collection of events which specify the kernel
+  /// \param depEvents is a collection of events that specify the kernel
   /// dependencies.
   /// \param kernelFunc is the kernel functor or lambda.
   /// \return an event that represents the status of the submitted kernel.
@@ -195,8 +195,8 @@ class _LIBSYCL_EXPORT queue {
     static_assert(
         (detail::CheckFunctionSignature<std::remove_reference_t<KernelType>,
                                         void()>::value),
-        "sycl::queue::single_task() requires a kernel instead of command "
-        "group. ");
+        "sycl::queue::single_task() requires a kernel instead of a command "
+        "group");
 
     setKernelParameters(depEvents);
     submitSingleTask<KernelName, KernelType>(kernelFunc);
@@ -213,8 +213,10 @@ class _LIBSYCL_EXPORT queue {
   // function in the host implementation of KernelFunc in submitSingleTask.
   template <typename, typename... Args>
   void sycl_kernel_launch(const char *KernelName, Args &&...args) {
-    static_assert((sizeof...(args) == 1) &&
-                  "Only 2 arguments are expected in sycl_kernel_launch.");
+    static_assert(
+        (sizeof...(args) == 1) &&
+        "sycl_kernel_launch expects only 2 arguments now: name of kernel and "
+        "callable object passed to kernel invocation by the user.");
     detail::ArgCollection TypelessArgs;
     (TypelessArgs.addArg(args), ...);
 
@@ -230,12 +232,12 @@ class _LIBSYCL_EXPORT queue {
 
   template <typename KernelName, typename KernelType>
   _LIBSYCL_ENTRY_POINT_ATTR__(KernelName)
-  void submitSingleTask(const KernelType KernelFunc) {
+  void submitSingleTask(const KernelType &KernelFunc) {
     KernelFunc();
   }
 
   event getLastEvent();
-  void submitKernelImpl(const char *KernelName,
+  void submitKernelImpl(std::string_view KernelName,
                         detail::ArgCollection &TypelessArgs);
   void setKernelParameters(const std::vector<event> &Events,
                            const detail::UnifiedRangeView &Range = {});
diff --git a/libsycl/src/detail/program_manager.cpp b/libsycl/src/detail/program_manager.cpp
index 7d6523daeb6ee..d4a79a709bab0 100644
--- a/libsycl/src/detail/program_manager.cpp
+++ b/libsycl/src/detail/program_manager.cpp
@@ -161,8 +161,9 @@ DeviceImageWrapper *ProgramManager::getDeviceImage(std::string_view KernelName,
                   "No kernel named " + std::string(KernelName) + " was found");
 }
 
-ol_symbol_handle_t ProgramManager::getOrCreateKernel(const char *KernelName,
-                                                     DeviceImpl &Device) {
+ol_symbol_handle_t
+ProgramManager::getOrCreateKernel(std::string_view KernelName,
+                                  DeviceImpl &Device) {
   std::lock_guard<std::mutex> ImageGuard(MImageCollectionMutex);
 
   auto KernelIDIt = MKernelNameToID.find(KernelName);
@@ -213,10 +214,10 @@ ProgramManager::getOrCreateProgram(DeviceImpl &Device,
 
 ol_symbol_handle_t ProgramManager::createKernel(ol_program_handle_t Program,
                                                 const kernel_id &KernelID,
-                                                const char *KernelName,
+                                                std::string_view KernelName,
                                                 DeviceImpl &Device) {
   ol_symbol_handle_t Kernel{};
-  callAndThrow(olGetSymbol, Program, KernelName, OL_SYMBOL_KIND_KERNEL,
+  callAndThrow(olGetSymbol, Program, KernelName.data(), OL_SYMBOL_KIND_KERNEL,
                &Kernel);
   MKernels.insert(
       std::make_pair(KernelID, std::make_pair(Device.getHandle(), Kernel)));
diff --git a/libsycl/src/detail/program_manager.hpp b/libsycl/src/detail/program_manager.hpp
index b017383a16b4c..1aa5e94b4355b 100644
--- a/libsycl/src/detail/program_manager.hpp
+++ b/libsycl/src/detail/program_manager.hpp
@@ -96,7 +96,7 @@ class ProgramManager {
   /// \param Device a device for which this kernel must be compiled.
   /// \return liboffload kernel handle that is ready to be passed to kernel
   /// execution methods.
-  ol_symbol_handle_t getOrCreateKernel(const char *KernelName,
+  ol_symbol_handle_t getOrCreateKernel(std::string_view KernelName,
                                        DeviceImpl &Device);
 
 private:
@@ -137,7 +137,8 @@ class ProgramManager {
   /// \return liboffload kernel for the requested configuration.
   ol_symbol_handle_t createKernel(ol_program_handle_t Program,
                                   const kernel_id &KernelID,
-                                  const char *KernelName, DeviceImpl &Device);
+                                  std::string_view KernelName,
+                                  DeviceImpl &Device);
 
   /// Searches for kernel.
   /// This call must be protected with mutex since it reads MKernels collection.
diff --git a/libsycl/src/detail/queue_impl.cpp b/libsycl/src/detail/queue_impl.cpp
index 243f38612e74c..329574c8f49ea 100644
--- a/libsycl/src/detail/queue_impl.cpp
+++ b/libsycl/src/detail/queue_impl.cpp
@@ -18,17 +18,18 @@ namespace detail {
 
 static void setKernelLaunchArgs(const detail::UnifiedRangeView &Range,
                                 ol_kernel_launch_size_args_t &ArgsToSet) {
+  assert(Range.MDims < 4 && "Invalid dimensions.");
   size_t GlobalSize[3] = {1, 1, 1};
   if (Range.MGlobalSize) {
     for (uint32_t I = 0; I < Range.MDims; I++) {
-      GlobalSize[I] = Range.MGlobalSize[I];
+      GlobalSize[I] = static_cast<uint32_t>(Range.MGlobalSize[I]);
     }
   }
 
   size_t GroupSize[3] = {1, 1, 1};
   if (Range.MLocalSize) {
     for (uint32_t I = 0; I < Range.MDims; I++) {
-      GroupSize[I] = Range.MLocalSize[I];
+      GroupSize[I] = static_cast<uint32_t>(Range.MLocalSize[I]);
     }
   }
 
@@ -80,8 +81,8 @@ void QueueImpl::setKernelParameters(std::vector<EventImplPtr> &&Events,
         "libsycl doesn't support cross-context/platform event dependencies "
         "now.");
 
-  // TODO: this convertion and storing only offload events is possible only
-  // while we don't have host tasks (and features based on host tasks, like
+  // TODO: this conversion and storing of only offload events is possible only
+  // while we don't have host tasks (or features based on host tasks, like
   // streams). With them - it is very likely we should copy EventImplPtr
   // (shared_ptr) and keep it here. Although it may differ if host tasks will be
   // implemented on offload level (no data now).
@@ -95,14 +96,19 @@ void QueueImpl::setKernelParameters(std::vector<EventImplPtr> &&Events,
   setKernelLaunchArgs(Range, MCurrentSubmitInfo.Range);
 }
 
-void QueueImpl::submitKernelImpl(const char *KernelName,
+void QueueImpl::submitKernelImpl(std::string_view KernelName,
                                  detail::ArgCollection &TypelessArgs) {
   ol_symbol_handle_t Kernel =
       detail::ProgramManager::getInstance().getOrCreateKernel(KernelName,
                                                               MDevice);
   assert(Kernel);
 
-  ol_event_handle_t NewEvent{};
+  // TODO: liboffload supports only in-order queues and no cross context waiting
+  // is available now that means that this code is excessive but correct. I
+  // don't want to skip it and rely on default liboffload behaviour that is
+  // applicable for in-order queue only. Once OOO queues are added this waiting
+  // must be disabled for in-order queues. Once host tasks are added - cross
+  // context dependencies should be enabled and checked as well.
   if (!MCurrentSubmitInfo.DepEvents.empty()) {
     callAndThrow(olWaitEvents, MOffloadQueue,
                  MCurrentSubmitInfo.DepEvents.data(),
@@ -137,9 +143,11 @@ void QueueImpl::submitKernelImpl(const char *KernelName,
   MCurrentSubmitInfo.Range = {};
   if (isFailed(Result))
     throw sycl::exception(sycl::make_error_code(sycl::errc::runtime),
-                          std::string("Kernel submission (") + KernelName +
-                              ") failed with " + formatCodeString(Result));
+                          std::string("Kernel submission (") +
+                              KernelName.data() + ") failed with " +
+                              formatCodeString(Result));
 
+  ol_event_handle_t NewEvent{};
   callAndThrow(olCreateEvent, MOffloadQueue, &NewEvent);
 
   MCurrentSubmitInfo.LastEvent =
diff --git a/libsycl/src/detail/queue_impl.hpp b/libsycl/src/detail/queue_impl.hpp
index 6edb40471826a..1d17031f5d6d1 100644
--- a/libsycl/src/detail/queue_impl.hpp
+++ b/libsycl/src/detail/queue_impl.hpp
@@ -15,7 +15,6 @@
 #include <OffloadAPI.h>
 
 #include <memory>
-#include <mutex>
 
 _LIBSYCL_BEGIN_NAMESPACE_SYCL
 namespace detail {
@@ -63,12 +62,12 @@ class QueueImpl : public std::enable_shared_from_this<QueueImpl> {
   /// \return true if and only if the queue is in order.
   bool isInOrder() const { return MIsInorder; }
 
-  /// Enqueues kernel to liboffload.
+  /// Enqueues a kernel to liboffload.
   /// Kernel parameters like dependencies and range must be passed in advance by
   /// calling setKernelParameters.
   /// \param KernelName a name of kernel to be enqueued.
   /// \param TypelessArgs data about kernel arguments to be used for enqueue.
-  void submitKernelImpl(const char *KernelName,
+  void submitKernelImpl(std::string_view KernelName,
                         detail::ArgCollection &TypelessArgs);
 
   /// \return an event impl object that corresponds to the last kernel
@@ -80,9 +79,9 @@ class QueueImpl : public std::enable_shared_from_this<QueueImpl> {
   }
 
   /// Sets kernel parameters to be used in the next submitKernelImpl call.
-  /// Must be called prior to submitKernelImpl call.
-  /// \param Events a collection of events that kernal depends on.
-  /// \param Range a unified range view of execution range.
+  /// Must be called prior to a submitKernelImpl call.
+  /// \param Events a collection of events that the kernal depends on.
+  /// \param Range a unified range view of the execution range.
   void setKernelParameters(std::vector<EventImplPtr> &&Events,
                            const detail::UnifiedRangeView &Range);
 
diff --git a/libsycl/src/queue.cpp b/libsycl/src/queue.cpp
index f9d867e9567d7..3821b4b5d5da2 100644
--- a/libsycl/src/queue.cpp
+++ b/libsycl/src/queue.cpp
@@ -47,7 +47,7 @@ void queue::setKernelParameters(const std::vector<event> &Events,
   return impl->setKernelParameters(std::move(DepEventImplRefs), Range);
 }
 
-void queue::submitKernelImpl(const char *KernelName,
+void queue::submitKernelImpl(std::string_view KernelName,
                              detail::ArgCollection &TypelessArgs) {
   impl->submitKernelImpl(KernelName, TypelessArgs);
 }
diff --git a/libsycl/test/basic/get_backend.cpp b/libsycl/test/basic/get_backend.cpp
index 064149a0c67e8..aa960f94ebd11 100644
--- a/libsycl/test/basic/get_backend.cpp
+++ b/libsycl/test/basic/get_backend.cpp
@@ -20,17 +20,16 @@ bool check(backend be) {
   default:
     return false;
   }
-  return false;
 }
 
-inline void return_fail() {
+void return_fail() {
   std::cout << "Failed" << std::endl;
   exit(1);
 }
 
 int main() {
   for (const auto &plt : platform::get_platforms()) {
-    if (check(plt.get_backend()) == false) {
+    if (!check(plt.get_backend())) {
       return_fail();
     }
 
diff --git a/libsycl/test/basic/submit_fn_ptr.cpp b/libsycl/test/basic/submit_fn_ptr.cpp
index 2a5ce832d4db2..b933c87e4ad15 100644
--- a/libsycl/test/basic/submit_fn_ptr.cpp
+++ b/libsycl/test/basic/submit_fn_ptr.cpp
@@ -12,7 +12,9 @@ int main() {
   *p = 0;
   q.single_task<Test>([=]() { *p = 42; });
   q.wait();
-  assert(*p == 42);
+
+  bool Failed = *p != 42;
+
   sycl::free(p, q);
-  return 0;
+  return Failed;
 }



More information about the llvm-branch-commits mailing list