[llvm-branch-commits] [llvm] [libsycl] add single_task (PR #188797)
Kseniya Tikhomirova via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Wed Apr 1 07:00:20 PDT 2026
https://github.com/KseniyaTikhomirova updated https://github.com/llvm/llvm-project/pull/188797
>From 03a1c675484bf83746ac9cb9b9580e2f3bed238f Mon Sep 17 00:00:00 2001
From: "Tikhomirova, Kseniya" <kseniya.tikhomirova at intel.com>
Date: Wed, 25 Mar 2026 05:18:49 -0700
Subject: [PATCH 1/2] [libsycl] add single_task
Signed-off-by: Tikhomirova, Kseniya <kseniya.tikhomirova at intel.com>
addition to single task
Signed-off-by: Tikhomirova, Kseniya <kseniya.tikhomirova at intel.com>
---
libsycl/docs/index.rst | 4 +
.../sycl/__impl/detail/arg_wrapper.hpp | 135 ++++++++++++++++++
.../sycl/__impl/detail/unified_range_view.hpp | 52 +++++++
libsycl/include/sycl/__impl/queue.hpp | 96 +++++++++++++
libsycl/src/detail/queue_impl.cpp | 112 +++++++++++++++
libsycl/src/detail/queue_impl.hpp | 37 +++++
libsycl/src/queue.cpp | 19 +++
libsycl/test/basic/get_backend.cpp | 54 +++++++
libsycl/test/basic/submit_fn_ptr.cpp | 18 +++
9 files changed, 527 insertions(+)
create mode 100644 libsycl/include/sycl/__impl/detail/arg_wrapper.hpp
create mode 100644 libsycl/include/sycl/__impl/detail/unified_range_view.hpp
create mode 100644 libsycl/test/basic/get_backend.cpp
create mode 100644 libsycl/test/basic/submit_fn_ptr.cpp
diff --git a/libsycl/docs/index.rst b/libsycl/docs/index.rst
index 9aa36b4a54c57..5961eeeedcedb 100644
--- a/libsycl/docs/index.rst
+++ b/libsycl/docs/index.rst
@@ -113,6 +113,10 @@ TODO for added SYCL classes
* to implement submit & copy with accessors (low priority)
* get_info & properties
* ctors that accepts context (blocked by lack of liboffload support)
+ * nd_range kernel submissions
+ * cross-context events wait (host tasks are needed)
+ * implement check if lambda arguments are device copyable (requires clang support of corresponding builtins)
+ * kernel instantiating on host (debugging purposes)
* ``property_list``: to fully implement and integrate with existing SYCL runtime classes supporting it
* usm allocations:
diff --git a/libsycl/include/sycl/__impl/detail/arg_wrapper.hpp b/libsycl/include/sycl/__impl/detail/arg_wrapper.hpp
new file mode 100644
index 0000000000000..96f60a3121787
--- /dev/null
+++ b/libsycl/include/sycl/__impl/detail/arg_wrapper.hpp
@@ -0,0 +1,135 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains helper functions used to wrap kernel arguments to
+/// typeless collection.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBSYCL___IMPL_DETAIL_ARG_WRAPPER_HPP
+#define _LIBSYCL___IMPL_DETAIL_ARG_WRAPPER_HPP
+
+#include <sycl/__impl/detail/config.hpp>
+#include <sycl/__impl/exception.hpp>
+
+#include <cassert>
+#include <memory>
+
+_LIBSYCL_BEGIN_NAMESPACE_SYCL
+
+namespace detail {
+
+/// Base class is needed for unification, we pass arguments through ABI
+/// boundary.
+class ArgWrapperBase {
+public:
+ ArgWrapperBase(const ArgWrapperBase &) = delete;
+ ArgWrapperBase &operator=(const ArgWrapperBase &) = delete;
+ virtual ~ArgWrapperBase() = default;
+
+ virtual void deepCopy() = 0;
+ virtual size_t getSize() const = 0;
+ virtual const void *getPtr() const = 0;
+
+protected:
+ ArgWrapperBase() = default;
+};
+
+/// Helps to manage arguments in a typeless way.
+template <typename Type> class ArgWrapper : public ArgWrapperBase {
+public:
+ ArgWrapper(Type &Arg) { Ptr = &Arg; }
+ ArgWrapper(const ArgWrapper &) = delete;
+ ArgWrapper &operator=(const ArgWrapper &) = delete;
+
+ /// \return size of argument in bytes.
+ size_t getSize() const override { return sizeof(Type); }
+
+ /// Returns raw pointer to the corresponding argument.
+ /// No copy is done by this method. It works with pointer to the memory whose
+ /// existence must be guaranteed by class user or with copy that must be
+ /// explicitly requested by class user via deepCopy method.
+ /// \return pointer to the argument.
+ const void *getPtr() const override {
+ assert((!DeepCopy || (DeepCopy.get()) == Ptr) &&
+ "Incorrect state of copied argument");
+ return Ptr;
+ }
+
+ /// Copies agrument to RT owned storage.
+ void deepCopy() override {
+ if (DeepCopy)
+ return;
+
+ DeepCopy.reset(new Type(*Ptr));
+ Ptr = DeepCopy.get();
+ }
+
+private:
+ Type *Ptr;
+ std::unique_ptr<Type> DeepCopy;
+};
+
+/// Collection of arguments. Provides functionality to accumulate all arguments
+/// data to pass through ABI boundary.
+class ArgCollection {
+public:
+ /// Adds argument to the collection. Don't own the memory. Argument lifetime
+ /// must be guaranteed by class user. If extended lifetime is needed (copy),
+ /// deepCopy must be called.
+ template <typename Type> void addArg(Type &Arg) {
+ MArgs.emplace_back(new ArgWrapper(Arg));
+ }
+
+ /// \return array of argument pointers.
+ const void **getArgPtrArray() {
+ if (MPtrs.size() != MArgs.size()) {
+ MPtrs.clear();
+ MPtrs.reserve(MArgs.size());
+ auto it = MArgs.cbegin();
+ while (it != MArgs.cend()) {
+ MPtrs.push_back((*it++)->getPtr());
+ }
+ }
+ return MPtrs.data();
+ }
+
+ /// \return array of argument sizes.
+ int64_t *getSizesArray() {
+ if (MSizes.size() != MArgs.size()) {
+ MSizes.clear();
+ MSizes.reserve(MArgs.size());
+ auto it = MArgs.cbegin();
+ while (it != MArgs.cend()) {
+ MSizes.push_back(static_cast<int64_t>((*it++)->getSize()));
+ }
+ }
+ return MSizes.data();
+ }
+
+ /// \return count of arguments in collection.
+ size_t getArgCount() { return MArgs.size(); }
+
+ /// Extends arguments lifetime by doing copy of all arguments.
+ void deepCopy() {
+ for (auto &Arg : MArgs)
+ Arg->deepCopy();
+ }
+
+private:
+ std::vector<std::unique_ptr<ArgWrapperBase>> MArgs;
+ std::vector<int64_t> MSizes;
+ std::vector<const void *> MPtrs;
+};
+
+} // namespace detail
+
+_LIBSYCL_END_NAMESPACE_SYCL
+
+#endif // _LIBSYCL___IMPL_DETAIL_ARG_WRAPPER_HPP
diff --git a/libsycl/include/sycl/__impl/detail/unified_range_view.hpp b/libsycl/include/sycl/__impl/detail/unified_range_view.hpp
new file mode 100644
index 0000000000000..afa613fc8627b
--- /dev/null
+++ b/libsycl/include/sycl/__impl/detail/unified_range_view.hpp
@@ -0,0 +1,52 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains helper function class to unify ABI for different kernel
+/// ranges.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBSYCL___IMPL_DETAIL_UNIFIED_RANGE_VIEW_HPP
+#define _LIBSYCL___IMPL_DETAIL_UNIFIED_RANGE_VIEW_HPP
+
+#include <sycl/__impl/detail/config.hpp>
+
+_LIBSYCL_BEGIN_NAMESPACE_SYCL
+
+namespace detail {
+
+/// The structure to keep dimension and references to ranges unified for
+/// all dimensions.
+class UnifiedRangeView {
+
+public:
+ /// Default contructed view matches single task execution range.
+ UnifiedRangeView() = default;
+ UnifiedRangeView(const UnifiedRangeView &Desc) = default;
+ UnifiedRangeView(UnifiedRangeView &&Desc) = default;
+ UnifiedRangeView &operator=(const UnifiedRangeView &Desc) = default;
+ UnifiedRangeView &operator=(UnifiedRangeView &&Desc) = default;
+
+ // TODO: ctors with sycl::range and nd::range will be added later.
+
+ UnifiedRangeView(const size_t *GlobalSize, const size_t *LocalSize,
+ const size_t *Offset, size_t Dims)
+ : MGlobalSize(GlobalSize), MLocalSize(LocalSize), MOffset(Offset),
+ MDims(Dims) {}
+
+ const size_t *MGlobalSize = nullptr;
+ const size_t *MLocalSize = nullptr;
+ const size_t *MOffset = nullptr;
+ size_t MDims = 1;
+};
+} // namespace detail
+
+_LIBSYCL_END_NAMESPACE_SYCL
+
+#endif // _LIBSYCL___IMPL_DETAIL_UNIFIED_RANGE_VIEW_HPP
diff --git a/libsycl/include/sycl/__impl/queue.hpp b/libsycl/include/sycl/__impl/queue.hpp
index 587f56a8eb245..d1ac320433c38 100644
--- a/libsycl/include/sycl/__impl/queue.hpp
+++ b/libsycl/include/sycl/__impl/queue.hpp
@@ -20,9 +20,11 @@
#include <sycl/__impl/event.hpp>
#include <sycl/__impl/property_list.hpp>
+#include <sycl/__impl/detail/arg_wrapper.hpp>
#include <sycl/__impl/detail/config.hpp>
#include <sycl/__impl/detail/default_async_handler.hpp>
#include <sycl/__impl/detail/obj_utils.hpp>
+#include <sycl/__impl/detail/unified_range_view.hpp>
_LIBSYCL_BEGIN_NAMESPACE_SYCL
@@ -31,6 +33,27 @@ class context;
namespace detail {
class QueueImpl;
+template <typename, typename T> struct CheckFunctionSignature {
+ static_assert(std::integral_constant<T, false>::value,
+ "Second template parameter is required to be of function type");
+};
+
+template <typename F, typename RetT, typename... Args>
+struct CheckFunctionSignature<F, RetT(Args...)> {
+private:
+ template <typename T>
+ static constexpr auto check(T *) -> typename std::is_same<
+ decltype(std::declval<T>().operator()(std::declval<Args>()...)),
+ RetT>::type;
+
+ template <typename> static constexpr std::false_type check(...);
+
+ using type = decltype(check<F>(0));
+
+public:
+ static constexpr bool value = type::value;
+};
+
} // namespace detail
// SYCL 2020 4.6.5. Queue class.
@@ -138,12 +161,85 @@ class _LIBSYCL_EXPORT queue {
template <typename Param>
typename Param::return_type get_backend_info() const;
+ /// Defines and invokes a SYCL kernel function as a lambda expression or a
+ /// named function object type.
+ ///
+ /// \param kernelFunc is the kernel functor or lambda.
+ /// \return an event that represents the status of the submitted kernel.
+ template <typename KernelName, typename KernelType>
+ event single_task(const KernelType &kernelFunc) {
+ return single_task<KernelName, KernelType>({}, kernelFunc);
+ }
+
+ /// Defines and invokes a SYCL kernel function as a lambda expression or a
+ /// named function object type.
+ ///
+ /// \param depEvent is an event that specifies the kernel dependency.
+ /// \param kernelFunc is the kernel functor or lambda.
+ /// \return an event that represents the status of the submitted kernel.
+ template <typename KernelName, typename KernelType>
+ event single_task(event depEvent, const KernelType &kernelFunc) {
+ return single_task<KernelName, KernelType>({depEvent}, kernelFunc);
+ }
+
+ /// Defines and invokes a SYCL kernel function as a lambda expression or a
+ /// named function object type.
+ ///
+ /// \param depEvents is a collection of events which specify the kernel
+ /// dependencies.
+ /// \param kernelFunc is the kernel functor or lambda.
+ /// \return an event that represents the status of the submitted kernel.
+ template <typename KernelName, typename KernelType>
+ event single_task(const std::vector<event> &depEvents,
+ const KernelType &kernelFunc) {
+ static_assert(
+ (detail::CheckFunctionSignature<std::remove_reference_t<KernelType>,
+ void()>::value),
+ "sycl::queue::single_task() requires a kernel instead of command "
+ "group. ");
+
+ setKernelParameters(depEvents);
+ submitSingleTask<KernelName, KernelType>(kernelFunc);
+ return getLastEvent();
+ }
+
/// Blocks the calling thread until all commands previously submitted to this
/// queue have completed. Synchronous errors are reported through SYCL
/// exceptions.
void wait();
private:
+ // Name of this function is defined by compiler. It generates call to this
+ // function in the host implementation of KernelFunc in submitSingleTask.
+ template <typename, typename... Args>
+ void sycl_kernel_launch(const char *KernelName, Args &&...args) {
+ static_assert((sizeof...(args) == 1) &&
+ "Only 2 arguments are expected in sycl_kernel_launch.");
+ detail::ArgCollection TypelessArgs;
+ (TypelessArgs.addArg(args), ...);
+
+ submitKernelImpl(KernelName, TypelessArgs);
+ }
+
+#ifdef SYCL_LANGUAGE_VERSION
+# define _LIBSYCL_ENTRY_POINT_ATTR__(KernelName) \
+ [[clang::sycl_kernel_entry_point(KernelName)]]
+#else
+# define _LIBSYCL_ENTRY_POINT_ATTR__(KernelName)
+#endif // SYCL_LANGUAGE_VERSION
+
+ template <typename KernelName, typename KernelType>
+ _LIBSYCL_ENTRY_POINT_ATTR__(KernelName)
+ void submitSingleTask(const KernelType KernelFunc) {
+ KernelFunc();
+ }
+
+ event getLastEvent();
+ void submitKernelImpl(const char *KernelName,
+ detail::ArgCollection &TypelessArgs);
+ void setKernelParameters(const std::vector<event> &Events,
+ const detail::UnifiedRangeView &Range = {});
+
queue(const std::shared_ptr<detail::QueueImpl> &Impl) : impl(Impl) {}
std::shared_ptr<detail::QueueImpl> impl;
diff --git a/libsycl/src/detail/queue_impl.cpp b/libsycl/src/detail/queue_impl.cpp
index 74ccc48877c09..243f38612e74c 100644
--- a/libsycl/src/detail/queue_impl.cpp
+++ b/libsycl/src/detail/queue_impl.cpp
@@ -16,6 +16,32 @@ _LIBSYCL_BEGIN_NAMESPACE_SYCL
namespace detail {
+static void setKernelLaunchArgs(const detail::UnifiedRangeView &Range,
+ ol_kernel_launch_size_args_t &ArgsToSet) {
+ size_t GlobalSize[3] = {1, 1, 1};
+ if (Range.MGlobalSize) {
+ for (uint32_t I = 0; I < Range.MDims; I++) {
+ GlobalSize[I] = Range.MGlobalSize[I];
+ }
+ }
+
+ size_t GroupSize[3] = {1, 1, 1};
+ if (Range.MLocalSize) {
+ for (uint32_t I = 0; I < Range.MDims; I++) {
+ GroupSize[I] = Range.MLocalSize[I];
+ }
+ }
+
+ ArgsToSet.Dimensions = Range.MDims;
+ ArgsToSet.NumGroups.x = GlobalSize[0] / GroupSize[0];
+ ArgsToSet.NumGroups.y = GlobalSize[1] / GroupSize[1];
+ ArgsToSet.NumGroups.z = GlobalSize[2] / GroupSize[2];
+ ArgsToSet.GroupSize.x = GroupSize[0];
+ ArgsToSet.GroupSize.y = GroupSize[1];
+ ArgsToSet.GroupSize.z = GroupSize[2];
+ ArgsToSet.DynSharedMemory = 0;
+}
+
QueueImpl::QueueImpl(DeviceImpl &deviceImpl, const async_handler &asyncHandler,
const property_list &propList, PrivateTag)
: MIsInorder(false), MAsyncHandler(asyncHandler), MPropList(propList),
@@ -34,5 +60,91 @@ backend QueueImpl::getBackend() const noexcept { return MDevice.getBackend(); }
void QueueImpl::wait() { callAndThrow(olSyncQueue, MOffloadQueue); }
+static bool checkEventsPlatformMatch(std::vector<EventImplPtr> &Events,
+ const PlatformImpl &QueuePlatform) {
+ // liboffload limitation to olWaitEvents. We can't do any extra handling for
+ // cross context/platform events without host task support now.
+ // "The input events can be from any queue on any device provided by the
+ // same platform as `Queue`."
+ return std::all_of(Events.cbegin(), Events.cend(),
+ [&QueuePlatform](const EventImplPtr &Event) {
+ return &Event->getPlatformImpl() == &QueuePlatform;
+ });
+}
+
+void QueueImpl::setKernelParameters(std::vector<EventImplPtr> &&Events,
+ const detail::UnifiedRangeView &Range) {
+ if (!checkEventsPlatformMatch(Events, MDevice.getPlatformImpl()))
+ throw sycl::exception(
+ sycl::make_error_code(sycl::errc::feature_not_supported),
+ "libsycl doesn't support cross-context/platform event dependencies "
+ "now.");
+
+ // TODO: this convertion and storing only offload events is possible only
+ // while we don't have host tasks (and features based on host tasks, like
+ // streams). With them - it is very likely we should copy EventImplPtr
+ // (shared_ptr) and keep it here. Although it may differ if host tasks will be
+ // implemented on offload level (no data now).
+ assert(MCurrentSubmitInfo.DepEvents.empty() &&
+ "Kernel submission must clean up dependencies.");
+ MCurrentSubmitInfo.DepEvents.reserve(Events.size());
+ for (auto &Event : Events) {
+ assert(Event && "Event impl object can't be nullptr");
+ MCurrentSubmitInfo.DepEvents.push_back(Event->getHandle());
+ }
+ setKernelLaunchArgs(Range, MCurrentSubmitInfo.Range);
+}
+
+void QueueImpl::submitKernelImpl(const char *KernelName,
+ detail::ArgCollection &TypelessArgs) {
+ ol_symbol_handle_t Kernel =
+ detail::ProgramManager::getInstance().getOrCreateKernel(KernelName,
+ MDevice);
+ assert(Kernel);
+
+ ol_event_handle_t NewEvent{};
+ if (!MCurrentSubmitInfo.DepEvents.empty()) {
+ callAndThrow(olWaitEvents, MOffloadQueue,
+ MCurrentSubmitInfo.DepEvents.data(),
+ MCurrentSubmitInfo.DepEvents.size());
+ }
+
+ const void *Arguments = nullptr;
+ int64_t ArgumentsSize = 0;
+ if (TypelessArgs.getArgCount()) {
+ // without decomposition and free functions extension we always expect 1
+ // argument to the kernel - lambda capture.
+ assert(TypelessArgs.getArgCount() == 1 &&
+ "No arg decomposition or extensions are supported now.");
+ // TODO: liboffload doesn't support more than 1 argument without copy now.
+ // It doesn't expect array of arguments, it requires a contiguous memory
+ // with args. While we have only 1 argument we don't need extra handling
+ // here, we just pass the first argument directly.
+ Arguments = TypelessArgs.getArgPtrArray()[0];
+ ArgumentsSize = TypelessArgs.getSizesArray()[0];
+ }
+
+ // ol_kernel_launch_prop_t Props[2];
+ // Props[0].type = OL_KERNEL_LAUNCH_PROP_TYPE_SIZE;
+ // Props[0].data = &ArgumentsSize;
+ // Props[1] = OL_KERNEL_LAUNCH_PROP_END;
+ auto Result =
+ olLaunchKernel(MOffloadQueue, MDevice.getHandle(), Kernel, Arguments,
+ ArgumentsSize, &MCurrentSubmitInfo.Range /*, Props*/);
+ // Clean up current kernel submit data to prepare structures for next
+ // submission.
+ MCurrentSubmitInfo.DepEvents.clear();
+ MCurrentSubmitInfo.Range = {};
+ if (isFailed(Result))
+ throw sycl::exception(sycl::make_error_code(sycl::errc::runtime),
+ std::string("Kernel submission (") + KernelName +
+ ") failed with " + formatCodeString(Result));
+
+ callAndThrow(olCreateEvent, MOffloadQueue, &NewEvent);
+
+ MCurrentSubmitInfo.LastEvent =
+ EventImpl::createEventWithHandle(NewEvent, MDevice.getPlatformImpl());
+}
+
} // namespace detail
_LIBSYCL_END_NAMESPACE_SYCL
diff --git a/libsycl/src/detail/queue_impl.hpp b/libsycl/src/detail/queue_impl.hpp
index cdb7595e852ec..6edb40471826a 100644
--- a/libsycl/src/detail/queue_impl.hpp
+++ b/libsycl/src/detail/queue_impl.hpp
@@ -15,6 +15,7 @@
#include <OffloadAPI.h>
#include <memory>
+#include <mutex>
_LIBSYCL_BEGIN_NAMESPACE_SYCL
namespace detail {
@@ -62,16 +63,52 @@ class QueueImpl : public std::enable_shared_from_this<QueueImpl> {
/// \return true if and only if the queue is in order.
bool isInOrder() const { return MIsInorder; }
+ /// Enqueues kernel to liboffload.
+ /// Kernel parameters like dependencies and range must be passed in advance by
+ /// calling setKernelParameters.
+ /// \param KernelName a name of kernel to be enqueued.
+ /// \param TypelessArgs data about kernel arguments to be used for enqueue.
+ void submitKernelImpl(const char *KernelName,
+ detail::ArgCollection &TypelessArgs);
+
+ /// \return an event impl object that corresponds to the last kernel
+ /// submission in the calling thread.
+ EventImplPtr getLastEvent() {
+ assert(MCurrentSubmitInfo.LastEvent &&
+ "getLastEvent must be called after enqueue");
+ return MCurrentSubmitInfo.LastEvent;
+ }
+
+ /// Sets kernel parameters to be used in the next submitKernelImpl call.
+ /// Must be called prior to submitKernelImpl call.
+ /// \param Events a collection of events that kernal depends on.
+ /// \param Range a unified range view of execution range.
+ void setKernelParameters(std::vector<EventImplPtr> &&Events,
+ const detail::UnifiedRangeView &Range);
+
/// Waits for completion of all kernels submitted to this queue.
void wait();
private:
+ // Queue features.
ol_queue_handle_t MOffloadQueue = {};
const bool MIsInorder;
const async_handler MAsyncHandler;
const property_list MPropList;
DeviceImpl &MDevice;
ContextImpl &MContext;
+
+ // Submit data.
+ struct KernelSubmitInfo {
+ EventImplPtr LastEvent;
+ ol_kernel_launch_size_args_t Range;
+ // TODO: consider storing EventImplPtr here, it will work with plain handle
+ // only because submission is done within queue::submit call. Otherwise we
+ // need to ensure that event handle is still alive by keeping our own copy
+ // of EventImpl.
+ std::vector<ol_event_handle_t> DepEvents;
+ };
+ inline static thread_local KernelSubmitInfo MCurrentSubmitInfo = {};
};
} // namespace detail
diff --git a/libsycl/src/queue.cpp b/libsycl/src/queue.cpp
index 9fe020eabf2cc..f9d867e9567d7 100644
--- a/libsycl/src/queue.cpp
+++ b/libsycl/src/queue.cpp
@@ -33,6 +33,25 @@ device queue::get_device() const {
bool queue::is_in_order() const { return impl->isInOrder(); }
+event queue::getLastEvent() {
+ return detail::createSyclObjFromImpl<event>(impl->getLastEvent());
+}
+
+void queue::setKernelParameters(const std::vector<event> &Events,
+ const detail::UnifiedRangeView &Range) {
+ std::vector<detail::EventImplPtr> DepEventImplRefs;
+ DepEventImplRefs.reserve(Events.size());
+ for (const auto &Event : Events) {
+ DepEventImplRefs.push_back(detail::getSyclObjImpl(Event));
+ }
+ return impl->setKernelParameters(std::move(DepEventImplRefs), Range);
+}
+
+void queue::submitKernelImpl(const char *KernelName,
+ detail::ArgCollection &TypelessArgs) {
+ impl->submitKernelImpl(KernelName, TypelessArgs);
+}
+
void queue::wait() { return impl->wait(); }
_LIBSYCL_END_NAMESPACE_SYCL
diff --git a/libsycl/test/basic/get_backend.cpp b/libsycl/test/basic/get_backend.cpp
new file mode 100644
index 0000000000000..064149a0c67e8
--- /dev/null
+++ b/libsycl/test/basic/get_backend.cpp
@@ -0,0 +1,54 @@
+// REQUIRES: any-device
+// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: %t.out
+
+#include <iostream>
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+class Kernel1;
+
+bool check(backend be) {
+ switch (be) {
+ case backend::opencl:
+ case backend::level_zero:
+ case backend::cuda:
+ case backend::hip:
+ return true;
+ default:
+ return false;
+ }
+ return false;
+}
+
+inline void return_fail() {
+ std::cout << "Failed" << std::endl;
+ exit(1);
+}
+
+int main() {
+ for (const auto &plt : platform::get_platforms()) {
+ if (check(plt.get_backend()) == false) {
+ return_fail();
+ }
+
+ auto device = device::get_devices()[0];
+ if (device.get_backend() != plt.get_backend()) {
+ return_fail();
+ }
+
+ queue q(device);
+ if (q.get_backend() != plt.get_backend()) {
+ return_fail();
+ }
+
+ event e = q.single_task<Kernel1>([]() {});
+ if (e.get_backend() != plt.get_backend()) {
+ return_fail();
+ }
+ }
+ std::cout << "Passed" << std::endl;
+ return 0;
+}
diff --git a/libsycl/test/basic/submit_fn_ptr.cpp b/libsycl/test/basic/submit_fn_ptr.cpp
new file mode 100644
index 0000000000000..2a5ce832d4db2
--- /dev/null
+++ b/libsycl/test/basic/submit_fn_ptr.cpp
@@ -0,0 +1,18 @@
+// REQUIRES: any-device
+// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: %t.out
+
+#include <sycl/sycl.hpp>
+
+class Test;
+
+int main() {
+ sycl::queue q;
+ int *p = sycl::malloc_shared<int>(1, q);
+ *p = 0;
+ q.single_task<Test>([=]() { *p = 42; });
+ q.wait();
+ assert(*p == 42);
+ sycl::free(p, q);
+ return 0;
+}
>From 59c746a55b77bff12ec5d5f80b2fe3373e80fcfd Mon Sep 17 00:00:00 2001
From: "Tikhomirova, Kseniya" <kseniya.tikhomirova at intel.com>
Date: Wed, 1 Apr 2026 06:59:55 -0700
Subject: [PATCH 2/2] fix comments
Signed-off-by: Tikhomirova, Kseniya <kseniya.tikhomirova at intel.com>
---
.../sycl/__impl/detail/arg_wrapper.hpp | 43 +++++++++++--------
.../sycl/__impl/detail/unified_range_view.hpp | 2 +-
libsycl/include/sycl/__impl/queue.hpp | 16 ++++---
libsycl/src/detail/program_manager.cpp | 9 ++--
libsycl/src/detail/program_manager.hpp | 5 ++-
libsycl/src/detail/queue_impl.cpp | 24 +++++++----
libsycl/src/detail/queue_impl.hpp | 11 +++--
libsycl/src/queue.cpp | 2 +-
libsycl/test/basic/get_backend.cpp | 5 +--
libsycl/test/basic/submit_fn_ptr.cpp | 6 ++-
10 files changed, 70 insertions(+), 53 deletions(-)
diff --git a/libsycl/include/sycl/__impl/detail/arg_wrapper.hpp b/libsycl/include/sycl/__impl/detail/arg_wrapper.hpp
index 96f60a3121787..ba279f3ab4d1c 100644
--- a/libsycl/include/sycl/__impl/detail/arg_wrapper.hpp
+++ b/libsycl/include/sycl/__impl/detail/arg_wrapper.hpp
@@ -33,7 +33,7 @@ class ArgWrapperBase {
ArgWrapperBase &operator=(const ArgWrapperBase &) = delete;
virtual ~ArgWrapperBase() = default;
- virtual void deepCopy() = 0;
+ virtual bool deepCopy() = 0;
virtual size_t getSize() const = 0;
virtual const void *getPtr() const = 0;
@@ -48,27 +48,29 @@ template <typename Type> class ArgWrapper : public ArgWrapperBase {
ArgWrapper(const ArgWrapper &) = delete;
ArgWrapper &operator=(const ArgWrapper &) = delete;
- /// \return size of argument in bytes.
+ /// \return the size of the argument in bytes.
size_t getSize() const override { return sizeof(Type); }
- /// Returns raw pointer to the corresponding argument.
+ /// Returns a raw pointer to the corresponding argument.
/// No copy is done by this method. It works with pointer to the memory whose
/// existence must be guaranteed by class user or with copy that must be
/// explicitly requested by class user via deepCopy method.
- /// \return pointer to the argument.
+ /// \return a pointer to the argument.
const void *getPtr() const override {
assert((!DeepCopy || (DeepCopy.get()) == Ptr) &&
"Incorrect state of copied argument");
return Ptr;
}
- /// Copies agrument to RT owned storage.
- void deepCopy() override {
+ /// Copies the agrument to RT owned storage.
+ /// \return true if argument was copied in this exact call.
+ bool deepCopy() override {
if (DeepCopy)
- return;
+ return false;
DeepCopy.reset(new Type(*Ptr));
Ptr = DeepCopy.get();
+ return true;
}
private:
@@ -80,9 +82,9 @@ template <typename Type> class ArgWrapper : public ArgWrapperBase {
/// data to pass through ABI boundary.
class ArgCollection {
public:
- /// Adds argument to the collection. Don't own the memory. Argument lifetime
- /// must be guaranteed by class user. If extended lifetime is needed (copy),
- /// deepCopy must be called.
+ /// Adds an argument to the collection. Doesn't own the memory, the argument
+ /// lifetime must be guaranteed by the class user. If extended lifetime is
+ /// needed (copy), deepCopy must be called.
template <typename Type> void addArg(Type &Arg) {
MArgs.emplace_back(new ArgWrapper(Arg));
}
@@ -92,10 +94,8 @@ class ArgCollection {
if (MPtrs.size() != MArgs.size()) {
MPtrs.clear();
MPtrs.reserve(MArgs.size());
- auto it = MArgs.cbegin();
- while (it != MArgs.cend()) {
- MPtrs.push_back((*it++)->getPtr());
- }
+ for (const auto &Argument : MArgs)
+ MPtrs.push_back(Argument->getPtr());
}
return MPtrs.data();
}
@@ -105,10 +105,8 @@ class ArgCollection {
if (MSizes.size() != MArgs.size()) {
MSizes.clear();
MSizes.reserve(MArgs.size());
- auto it = MArgs.cbegin();
- while (it != MArgs.cend()) {
- MSizes.push_back(static_cast<int64_t>((*it++)->getSize()));
- }
+ for (const auto &Argument : MArgs)
+ MSizes.push_back(static_cast<int64_t>(Argument->getSize()));
}
return MSizes.data();
}
@@ -118,8 +116,15 @@ class ArgCollection {
/// Extends arguments lifetime by doing copy of all arguments.
void deepCopy() {
+ bool CopiedAtLeastOne = false;
for (auto &Arg : MArgs)
- Arg->deepCopy();
+ CopiedAtLeastOne |= Arg->deepCopy();
+
+ if (CopiedAtLeastOne) {
+ MPtrs.clear();
+ // MSizes must be the same. No changes here so no need to clean and
+ // refill.
+ }
}
private:
diff --git a/libsycl/include/sycl/__impl/detail/unified_range_view.hpp b/libsycl/include/sycl/__impl/detail/unified_range_view.hpp
index afa613fc8627b..8eb256b7e6b0a 100644
--- a/libsycl/include/sycl/__impl/detail/unified_range_view.hpp
+++ b/libsycl/include/sycl/__impl/detail/unified_range_view.hpp
@@ -26,7 +26,7 @@ namespace detail {
class UnifiedRangeView {
public:
- /// Default contructed view matches single task execution range.
+ /// Default contructed view matches the single task execution range.
UnifiedRangeView() = default;
UnifiedRangeView(const UnifiedRangeView &Desc) = default;
UnifiedRangeView(UnifiedRangeView &&Desc) = default;
diff --git a/libsycl/include/sycl/__impl/queue.hpp b/libsycl/include/sycl/__impl/queue.hpp
index d1ac320433c38..a97df7a6260d2 100644
--- a/libsycl/include/sycl/__impl/queue.hpp
+++ b/libsycl/include/sycl/__impl/queue.hpp
@@ -185,7 +185,7 @@ class _LIBSYCL_EXPORT queue {
/// Defines and invokes a SYCL kernel function as a lambda expression or a
/// named function object type.
///
- /// \param depEvents is a collection of events which specify the kernel
+ /// \param depEvents is a collection of events that specify the kernel
/// dependencies.
/// \param kernelFunc is the kernel functor or lambda.
/// \return an event that represents the status of the submitted kernel.
@@ -195,8 +195,8 @@ class _LIBSYCL_EXPORT queue {
static_assert(
(detail::CheckFunctionSignature<std::remove_reference_t<KernelType>,
void()>::value),
- "sycl::queue::single_task() requires a kernel instead of command "
- "group. ");
+ "sycl::queue::single_task() requires a kernel instead of a command "
+ "group");
setKernelParameters(depEvents);
submitSingleTask<KernelName, KernelType>(kernelFunc);
@@ -213,8 +213,10 @@ class _LIBSYCL_EXPORT queue {
// function in the host implementation of KernelFunc in submitSingleTask.
template <typename, typename... Args>
void sycl_kernel_launch(const char *KernelName, Args &&...args) {
- static_assert((sizeof...(args) == 1) &&
- "Only 2 arguments are expected in sycl_kernel_launch.");
+ static_assert(
+ (sizeof...(args) == 1) &&
+ "sycl_kernel_launch expects only 2 arguments now: name of kernel and "
+ "callable object passed to kernel invocation by the user.");
detail::ArgCollection TypelessArgs;
(TypelessArgs.addArg(args), ...);
@@ -230,12 +232,12 @@ class _LIBSYCL_EXPORT queue {
template <typename KernelName, typename KernelType>
_LIBSYCL_ENTRY_POINT_ATTR__(KernelName)
- void submitSingleTask(const KernelType KernelFunc) {
+ void submitSingleTask(const KernelType &KernelFunc) {
KernelFunc();
}
event getLastEvent();
- void submitKernelImpl(const char *KernelName,
+ void submitKernelImpl(std::string_view KernelName,
detail::ArgCollection &TypelessArgs);
void setKernelParameters(const std::vector<event> &Events,
const detail::UnifiedRangeView &Range = {});
diff --git a/libsycl/src/detail/program_manager.cpp b/libsycl/src/detail/program_manager.cpp
index 7d6523daeb6ee..d4a79a709bab0 100644
--- a/libsycl/src/detail/program_manager.cpp
+++ b/libsycl/src/detail/program_manager.cpp
@@ -161,8 +161,9 @@ DeviceImageWrapper *ProgramManager::getDeviceImage(std::string_view KernelName,
"No kernel named " + std::string(KernelName) + " was found");
}
-ol_symbol_handle_t ProgramManager::getOrCreateKernel(const char *KernelName,
- DeviceImpl &Device) {
+ol_symbol_handle_t
+ProgramManager::getOrCreateKernel(std::string_view KernelName,
+ DeviceImpl &Device) {
std::lock_guard<std::mutex> ImageGuard(MImageCollectionMutex);
auto KernelIDIt = MKernelNameToID.find(KernelName);
@@ -213,10 +214,10 @@ ProgramManager::getOrCreateProgram(DeviceImpl &Device,
ol_symbol_handle_t ProgramManager::createKernel(ol_program_handle_t Program,
const kernel_id &KernelID,
- const char *KernelName,
+ std::string_view KernelName,
DeviceImpl &Device) {
ol_symbol_handle_t Kernel{};
- callAndThrow(olGetSymbol, Program, KernelName, OL_SYMBOL_KIND_KERNEL,
+ callAndThrow(olGetSymbol, Program, KernelName.data(), OL_SYMBOL_KIND_KERNEL,
&Kernel);
MKernels.insert(
std::make_pair(KernelID, std::make_pair(Device.getHandle(), Kernel)));
diff --git a/libsycl/src/detail/program_manager.hpp b/libsycl/src/detail/program_manager.hpp
index b017383a16b4c..1aa5e94b4355b 100644
--- a/libsycl/src/detail/program_manager.hpp
+++ b/libsycl/src/detail/program_manager.hpp
@@ -96,7 +96,7 @@ class ProgramManager {
/// \param Device a device for which this kernel must be compiled.
/// \return liboffload kernel handle that is ready to be passed to kernel
/// execution methods.
- ol_symbol_handle_t getOrCreateKernel(const char *KernelName,
+ ol_symbol_handle_t getOrCreateKernel(std::string_view KernelName,
DeviceImpl &Device);
private:
@@ -137,7 +137,8 @@ class ProgramManager {
/// \return liboffload kernel for the requested configuration.
ol_symbol_handle_t createKernel(ol_program_handle_t Program,
const kernel_id &KernelID,
- const char *KernelName, DeviceImpl &Device);
+ std::string_view KernelName,
+ DeviceImpl &Device);
/// Searches for kernel.
/// This call must be protected with mutex since it reads MKernels collection.
diff --git a/libsycl/src/detail/queue_impl.cpp b/libsycl/src/detail/queue_impl.cpp
index 243f38612e74c..329574c8f49ea 100644
--- a/libsycl/src/detail/queue_impl.cpp
+++ b/libsycl/src/detail/queue_impl.cpp
@@ -18,17 +18,18 @@ namespace detail {
static void setKernelLaunchArgs(const detail::UnifiedRangeView &Range,
ol_kernel_launch_size_args_t &ArgsToSet) {
+ assert(Range.MDims < 4 && "Invalid dimensions.");
size_t GlobalSize[3] = {1, 1, 1};
if (Range.MGlobalSize) {
for (uint32_t I = 0; I < Range.MDims; I++) {
- GlobalSize[I] = Range.MGlobalSize[I];
+ GlobalSize[I] = static_cast<uint32_t>(Range.MGlobalSize[I]);
}
}
size_t GroupSize[3] = {1, 1, 1};
if (Range.MLocalSize) {
for (uint32_t I = 0; I < Range.MDims; I++) {
- GroupSize[I] = Range.MLocalSize[I];
+ GroupSize[I] = static_cast<uint32_t>(Range.MLocalSize[I]);
}
}
@@ -80,8 +81,8 @@ void QueueImpl::setKernelParameters(std::vector<EventImplPtr> &&Events,
"libsycl doesn't support cross-context/platform event dependencies "
"now.");
- // TODO: this convertion and storing only offload events is possible only
- // while we don't have host tasks (and features based on host tasks, like
+ // TODO: this conversion and storing of only offload events is possible only
+ // while we don't have host tasks (or features based on host tasks, like
// streams). With them - it is very likely we should copy EventImplPtr
// (shared_ptr) and keep it here. Although it may differ if host tasks will be
// implemented on offload level (no data now).
@@ -95,14 +96,19 @@ void QueueImpl::setKernelParameters(std::vector<EventImplPtr> &&Events,
setKernelLaunchArgs(Range, MCurrentSubmitInfo.Range);
}
-void QueueImpl::submitKernelImpl(const char *KernelName,
+void QueueImpl::submitKernelImpl(std::string_view KernelName,
detail::ArgCollection &TypelessArgs) {
ol_symbol_handle_t Kernel =
detail::ProgramManager::getInstance().getOrCreateKernel(KernelName,
MDevice);
assert(Kernel);
- ol_event_handle_t NewEvent{};
+ // TODO: liboffload supports only in-order queues and no cross context waiting
+ // is available now that means that this code is excessive but correct. I
+ // don't want to skip it and rely on default liboffload behaviour that is
+ // applicable for in-order queue only. Once OOO queues are added this waiting
+ // must be disabled for in-order queues. Once host tasks are added - cross
+ // context dependencies should be enabled and checked as well.
if (!MCurrentSubmitInfo.DepEvents.empty()) {
callAndThrow(olWaitEvents, MOffloadQueue,
MCurrentSubmitInfo.DepEvents.data(),
@@ -137,9 +143,11 @@ void QueueImpl::submitKernelImpl(const char *KernelName,
MCurrentSubmitInfo.Range = {};
if (isFailed(Result))
throw sycl::exception(sycl::make_error_code(sycl::errc::runtime),
- std::string("Kernel submission (") + KernelName +
- ") failed with " + formatCodeString(Result));
+ std::string("Kernel submission (") +
+ KernelName.data() + ") failed with " +
+ formatCodeString(Result));
+ ol_event_handle_t NewEvent{};
callAndThrow(olCreateEvent, MOffloadQueue, &NewEvent);
MCurrentSubmitInfo.LastEvent =
diff --git a/libsycl/src/detail/queue_impl.hpp b/libsycl/src/detail/queue_impl.hpp
index 6edb40471826a..1d17031f5d6d1 100644
--- a/libsycl/src/detail/queue_impl.hpp
+++ b/libsycl/src/detail/queue_impl.hpp
@@ -15,7 +15,6 @@
#include <OffloadAPI.h>
#include <memory>
-#include <mutex>
_LIBSYCL_BEGIN_NAMESPACE_SYCL
namespace detail {
@@ -63,12 +62,12 @@ class QueueImpl : public std::enable_shared_from_this<QueueImpl> {
/// \return true if and only if the queue is in order.
bool isInOrder() const { return MIsInorder; }
- /// Enqueues kernel to liboffload.
+ /// Enqueues a kernel to liboffload.
/// Kernel parameters like dependencies and range must be passed in advance by
/// calling setKernelParameters.
/// \param KernelName a name of kernel to be enqueued.
/// \param TypelessArgs data about kernel arguments to be used for enqueue.
- void submitKernelImpl(const char *KernelName,
+ void submitKernelImpl(std::string_view KernelName,
detail::ArgCollection &TypelessArgs);
/// \return an event impl object that corresponds to the last kernel
@@ -80,9 +79,9 @@ class QueueImpl : public std::enable_shared_from_this<QueueImpl> {
}
/// Sets kernel parameters to be used in the next submitKernelImpl call.
- /// Must be called prior to submitKernelImpl call.
- /// \param Events a collection of events that kernal depends on.
- /// \param Range a unified range view of execution range.
+ /// Must be called prior to a submitKernelImpl call.
+ /// \param Events a collection of events that the kernal depends on.
+ /// \param Range a unified range view of the execution range.
void setKernelParameters(std::vector<EventImplPtr> &&Events,
const detail::UnifiedRangeView &Range);
diff --git a/libsycl/src/queue.cpp b/libsycl/src/queue.cpp
index f9d867e9567d7..3821b4b5d5da2 100644
--- a/libsycl/src/queue.cpp
+++ b/libsycl/src/queue.cpp
@@ -47,7 +47,7 @@ void queue::setKernelParameters(const std::vector<event> &Events,
return impl->setKernelParameters(std::move(DepEventImplRefs), Range);
}
-void queue::submitKernelImpl(const char *KernelName,
+void queue::submitKernelImpl(std::string_view KernelName,
detail::ArgCollection &TypelessArgs) {
impl->submitKernelImpl(KernelName, TypelessArgs);
}
diff --git a/libsycl/test/basic/get_backend.cpp b/libsycl/test/basic/get_backend.cpp
index 064149a0c67e8..aa960f94ebd11 100644
--- a/libsycl/test/basic/get_backend.cpp
+++ b/libsycl/test/basic/get_backend.cpp
@@ -20,17 +20,16 @@ bool check(backend be) {
default:
return false;
}
- return false;
}
-inline void return_fail() {
+void return_fail() {
std::cout << "Failed" << std::endl;
exit(1);
}
int main() {
for (const auto &plt : platform::get_platforms()) {
- if (check(plt.get_backend()) == false) {
+ if (!check(plt.get_backend())) {
return_fail();
}
diff --git a/libsycl/test/basic/submit_fn_ptr.cpp b/libsycl/test/basic/submit_fn_ptr.cpp
index 2a5ce832d4db2..b933c87e4ad15 100644
--- a/libsycl/test/basic/submit_fn_ptr.cpp
+++ b/libsycl/test/basic/submit_fn_ptr.cpp
@@ -12,7 +12,9 @@ int main() {
*p = 0;
q.single_task<Test>([=]() { *p = 42; });
q.wait();
- assert(*p == 42);
+
+ bool Failed = *p != 42;
+
sycl::free(p, q);
- return 0;
+ return Failed;
}
More information about the llvm-branch-commits
mailing list