[llvm-branch-commits] [llvm] [libsycl] Add parallel_for feature (PR #189068)

Thu Apr 23 04:05:29 PDT 2026

https://github.com/KseniyaTikhomirova updated https://github.com/llvm/llvm-project/pull/189068

>From 03a1c675484bf83746ac9cb9b9580e2f3bed238f Mon Sep 17 00:00:00 2001
From: "Tikhomirova, Kseniya" <kseniya.tikhomirova at intel.com>
Date: Wed, 25 Mar 2026 05:18:49 -0700
Subject: [PATCH 1/7] [libsycl] add single_task

Signed-off-by: Tikhomirova, Kseniya <kseniya.tikhomirova at intel.com>

addition to single task

Signed-off-by: Tikhomirova, Kseniya <kseniya.tikhomirova at intel.com>
---
 libsycl/docs/index.rst                        |   4 +
 .../sycl/__impl/detail/arg_wrapper.hpp        | 135 ++++++++++++++++++
 .../sycl/__impl/detail/unified_range_view.hpp |  52 +++++++
 libsycl/include/sycl/__impl/queue.hpp         |  96 +++++++++++++
 libsycl/src/detail/queue_impl.cpp             | 112 +++++++++++++++
 libsycl/src/detail/queue_impl.hpp             |  37 +++++
 libsycl/src/queue.cpp                         |  19 +++
 libsycl/test/basic/get_backend.cpp            |  54 +++++++
 libsycl/test/basic/submit_fn_ptr.cpp          |  18 +++
 9 files changed, 527 insertions(+)
 create mode 100644 libsycl/include/sycl/__impl/detail/arg_wrapper.hpp
 create mode 100644 libsycl/include/sycl/__impl/detail/unified_range_view.hpp
 create mode 100644 libsycl/test/basic/get_backend.cpp
 create mode 100644 libsycl/test/basic/submit_fn_ptr.cpp

diff --git a/libsycl/docs/index.rst b/libsycl/docs/index.rst
index 9aa36b4a54c57..5961eeeedcedb 100644
--- a/libsycl/docs/index.rst
+++ b/libsycl/docs/index.rst
@@ -113,6 +113,10 @@ TODO for added SYCL classes
   * to implement submit & copy with accessors (low priority)
   * get_info & properties
   * ctors that accepts context (blocked by lack of liboffload support)
+  * nd_range kernel submissions
+  * cross-context events wait (host tasks are needed)
+  * implement check if lambda arguments are device copyable (requires clang support of corresponding builtins)
+  * kernel instantiating on host (debugging purposes)
 
 * ``property_list``: to fully implement and integrate with existing SYCL runtime classes supporting it
 * usm allocations:
diff --git a/libsycl/include/sycl/__impl/detail/arg_wrapper.hpp b/libsycl/include/sycl/__impl/detail/arg_wrapper.hpp
new file mode 100644
index 0000000000000..96f60a3121787
--- /dev/null
+++ b/libsycl/include/sycl/__impl/detail/arg_wrapper.hpp
@@ -0,0 +1,135 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains helper functions used to wrap kernel arguments to
+/// typeless collection.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBSYCL___IMPL_DETAIL_ARG_WRAPPER_HPP
+#define _LIBSYCL___IMPL_DETAIL_ARG_WRAPPER_HPP
+
+#include <sycl/__impl/detail/config.hpp>
+#include <sycl/__impl/exception.hpp>
+
+#include <cassert>
+#include <memory>
+
+_LIBSYCL_BEGIN_NAMESPACE_SYCL
+
+namespace detail {
+
+/// Base class is needed for unification, we pass arguments through ABI
+/// boundary.
+class ArgWrapperBase {
+public:
+  ArgWrapperBase(const ArgWrapperBase &) = delete;
+  ArgWrapperBase &operator=(const ArgWrapperBase &) = delete;
+  virtual ~ArgWrapperBase() = default;
+
+  virtual void deepCopy() = 0;
+  virtual size_t getSize() const = 0;
+  virtual const void *getPtr() const = 0;
+
+protected:
+  ArgWrapperBase() = default;
+};
+
+/// Helps to manage arguments in a typeless way.
+template <typename Type> class ArgWrapper : public ArgWrapperBase {
+public:
+  ArgWrapper(Type &Arg) { Ptr = &Arg; }
+  ArgWrapper(const ArgWrapper &) = delete;
+  ArgWrapper &operator=(const ArgWrapper &) = delete;
+
+  /// \return size of argument in bytes.
+  size_t getSize() const override { return sizeof(Type); }
+
+  /// Returns raw pointer to the corresponding argument.
+  /// No copy is done by this method. It works with pointer to the memory whose
+  /// existence must be guaranteed by class user or with copy that must be
+  /// explicitly requested by class user via deepCopy method.
+  /// \return pointer to the argument.
+  const void *getPtr() const override {
+    assert((!DeepCopy || (DeepCopy.get()) == Ptr) &&
+           "Incorrect state of copied argument");
+    return Ptr;
+  }
+
+  /// Copies agrument to RT owned storage.
+  void deepCopy() override {
+    if (DeepCopy)
+      return;
+
+    DeepCopy.reset(new Type(*Ptr));
+    Ptr = DeepCopy.get();
+  }
+
+private:
+  Type *Ptr;
+  std::unique_ptr<Type> DeepCopy;
+};
+
+/// Collection of arguments. Provides functionality to accumulate all arguments
+/// data to pass through ABI boundary.
+class ArgCollection {
+public:
+  /// Adds argument to the collection. Don't own the memory. Argument lifetime
+  /// must be guaranteed by class user. If extended lifetime is needed (copy),
+  /// deepCopy must be called.
+  template <typename Type> void addArg(Type &Arg) {
+    MArgs.emplace_back(new ArgWrapper(Arg));
+  }
+
+  /// \return array of argument pointers.
+  const void **getArgPtrArray() {
+    if (MPtrs.size() != MArgs.size()) {
+      MPtrs.clear();
+      MPtrs.reserve(MArgs.size());
+      auto it = MArgs.cbegin();
+      while (it != MArgs.cend()) {
+        MPtrs.push_back((*it++)->getPtr());
+      }
+    }
+    return MPtrs.data();
+  }
+
+  /// \return array of argument sizes.
+  int64_t *getSizesArray() {
+    if (MSizes.size() != MArgs.size()) {
+      MSizes.clear();
+      MSizes.reserve(MArgs.size());
+      auto it = MArgs.cbegin();
+      while (it != MArgs.cend()) {
+        MSizes.push_back(static_cast<int64_t>((*it++)->getSize()));
+      }
+    }
+    return MSizes.data();
+  }
+
+  /// \return count of arguments in collection.
+  size_t getArgCount() { return MArgs.size(); }
+
+  /// Extends arguments lifetime by doing copy of all arguments.
+  void deepCopy() {
+    for (auto &Arg : MArgs)
+      Arg->deepCopy();
+  }
+
+private:
+  std::vector<std::unique_ptr<ArgWrapperBase>> MArgs;
+  std::vector<int64_t> MSizes;
+  std::vector<const void *> MPtrs;
+};
+
+} // namespace detail
+
+_LIBSYCL_END_NAMESPACE_SYCL
+
+#endif // _LIBSYCL___IMPL_DETAIL_ARG_WRAPPER_HPP
diff --git a/libsycl/include/sycl/__impl/detail/unified_range_view.hpp b/libsycl/include/sycl/__impl/detail/unified_range_view.hpp
new file mode 100644
index 0000000000000..afa613fc8627b
--- /dev/null
+++ b/libsycl/include/sycl/__impl/detail/unified_range_view.hpp
@@ -0,0 +1,52 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains helper function class to unify ABI for different kernel
+/// ranges.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBSYCL___IMPL_DETAIL_UNIFIED_RANGE_VIEW_HPP
+#define _LIBSYCL___IMPL_DETAIL_UNIFIED_RANGE_VIEW_HPP
+
+#include <sycl/__impl/detail/config.hpp>
+
+_LIBSYCL_BEGIN_NAMESPACE_SYCL
+
+namespace detail {
+
+/// The structure to keep dimension and references to ranges unified for
+/// all dimensions.
+class UnifiedRangeView {
+
+public:
+  /// Default contructed view matches single task execution range.
+  UnifiedRangeView() = default;
+  UnifiedRangeView(const UnifiedRangeView &Desc) = default;
+  UnifiedRangeView(UnifiedRangeView &&Desc) = default;
+  UnifiedRangeView &operator=(const UnifiedRangeView &Desc) = default;
+  UnifiedRangeView &operator=(UnifiedRangeView &&Desc) = default;
+
+  // TODO: ctors with sycl::range and nd::range will be added later.
+
+  UnifiedRangeView(const size_t *GlobalSize, const size_t *LocalSize,
+                   const size_t *Offset, size_t Dims)
+      : MGlobalSize(GlobalSize), MLocalSize(LocalSize), MOffset(Offset),
+        MDims(Dims) {}
+
+  const size_t *MGlobalSize = nullptr;
+  const size_t *MLocalSize = nullptr;
+  const size_t *MOffset = nullptr;
+  size_t MDims = 1;
+};
+} // namespace detail
+
+_LIBSYCL_END_NAMESPACE_SYCL
+
+#endif // _LIBSYCL___IMPL_DETAIL_UNIFIED_RANGE_VIEW_HPP
diff --git a/libsycl/include/sycl/__impl/queue.hpp b/libsycl/include/sycl/__impl/queue.hpp
index 587f56a8eb245..d1ac320433c38 100644
--- a/libsycl/include/sycl/__impl/queue.hpp
+++ b/libsycl/include/sycl/__impl/queue.hpp
@@ -20,9 +20,11 @@
 #include <sycl/__impl/event.hpp>
 #include <sycl/__impl/property_list.hpp>
 
+#include <sycl/__impl/detail/arg_wrapper.hpp>
 #include <sycl/__impl/detail/config.hpp>
 #include <sycl/__impl/detail/default_async_handler.hpp>
 #include <sycl/__impl/detail/obj_utils.hpp>
+#include <sycl/__impl/detail/unified_range_view.hpp>
 
 _LIBSYCL_BEGIN_NAMESPACE_SYCL
 
@@ -31,6 +33,27 @@ class context;
 namespace detail {
 class QueueImpl;
 
+template <typename, typename T> struct CheckFunctionSignature {
+  static_assert(std::integral_constant<T, false>::value,
+                "Second template parameter is required to be of function type");
+};
+
+template <typename F, typename RetT, typename... Args>
+struct CheckFunctionSignature<F, RetT(Args...)> {
+private:
+  template <typename T>
+  static constexpr auto check(T *) -> typename std::is_same<
+      decltype(std::declval<T>().operator()(std::declval<Args>()...)),
+      RetT>::type;
+
+  template <typename> static constexpr std::false_type check(...);
+
+  using type = decltype(check<F>(0));
+
+public:
+  static constexpr bool value = type::value;
+};
+
 } // namespace detail
 
 // SYCL 2020 4.6.5. Queue class.
@@ -138,12 +161,85 @@ class _LIBSYCL_EXPORT queue {
   template <typename Param>
   typename Param::return_type get_backend_info() const;
 
+  /// Defines and invokes a SYCL kernel function as a lambda expression or a
+  /// named function object type.
+  ///
+  /// \param kernelFunc is the kernel functor or lambda.
+  /// \return an event that represents the status of the submitted kernel.
+  template <typename KernelName, typename KernelType>
+  event single_task(const KernelType &kernelFunc) {
+    return single_task<KernelName, KernelType>({}, kernelFunc);
+  }
+
+  /// Defines and invokes a SYCL kernel function as a lambda expression or a
+  /// named function object type.
+  ///
+  /// \param depEvent is an event that specifies the kernel dependency.
+  /// \param kernelFunc is the kernel functor or lambda.
+  /// \return an event that represents the status of the submitted kernel.
+  template <typename KernelName, typename KernelType>
+  event single_task(event depEvent, const KernelType &kernelFunc) {
+    return single_task<KernelName, KernelType>({depEvent}, kernelFunc);
+  }
+
+  /// Defines and invokes a SYCL kernel function as a lambda expression or a
+  /// named function object type.
+  ///
+  /// \param depEvents is a collection of events which specify the kernel
+  /// dependencies.
+  /// \param kernelFunc is the kernel functor or lambda.
+  /// \return an event that represents the status of the submitted kernel.
+  template <typename KernelName, typename KernelType>
+  event single_task(const std::vector<event> &depEvents,
+                    const KernelType &kernelFunc) {
+    static_assert(
+        (detail::CheckFunctionSignature<std::remove_reference_t<KernelType>,
+                                        void()>::value),
+        "sycl::queue::single_task() requires a kernel instead of command "
+        "group. ");
+
+    setKernelParameters(depEvents);
+    submitSingleTask<KernelName, KernelType>(kernelFunc);
+    return getLastEvent();
+  }
+
   /// Blocks the calling thread until all commands previously submitted to this
   /// queue have completed. Synchronous errors are reported through SYCL
   /// exceptions.
   void wait();
 
 private:
+  // Name of this function is defined by compiler. It generates call to this
+  // function in the host implementation of KernelFunc in submitSingleTask.
+  template <typename, typename... Args>
+  void sycl_kernel_launch(const char *KernelName, Args &&...args) {
+    static_assert((sizeof...(args) == 1) &&
+                  "Only 2 arguments are expected in sycl_kernel_launch.");
+    detail::ArgCollection TypelessArgs;
+    (TypelessArgs.addArg(args), ...);
+
+    submitKernelImpl(KernelName, TypelessArgs);
+  }
+
+#ifdef SYCL_LANGUAGE_VERSION
+#  define _LIBSYCL_ENTRY_POINT_ATTR__(KernelName)                              \
+    [[clang::sycl_kernel_entry_point(KernelName)]]
+#else
+#  define _LIBSYCL_ENTRY_POINT_ATTR__(KernelName)
+#endif // SYCL_LANGUAGE_VERSION
+
+  template <typename KernelName, typename KernelType>
+  _LIBSYCL_ENTRY_POINT_ATTR__(KernelName)
+  void submitSingleTask(const KernelType KernelFunc) {
+    KernelFunc();
+  }
+
+  event getLastEvent();
+  void submitKernelImpl(const char *KernelName,
+                        detail::ArgCollection &TypelessArgs);
+  void setKernelParameters(const std::vector<event> &Events,
+                           const detail::UnifiedRangeView &Range = {});
+
   queue(const std::shared_ptr<detail::QueueImpl> &Impl) : impl(Impl) {}
   std::shared_ptr<detail::QueueImpl> impl;
 
diff --git a/libsycl/src/detail/queue_impl.cpp b/libsycl/src/detail/queue_impl.cpp
index 74ccc48877c09..243f38612e74c 100644
--- a/libsycl/src/detail/queue_impl.cpp
+++ b/libsycl/src/detail/queue_impl.cpp
@@ -16,6 +16,32 @@ _LIBSYCL_BEGIN_NAMESPACE_SYCL
 
 namespace detail {
 
+static void setKernelLaunchArgs(const detail::UnifiedRangeView &Range,
+                                ol_kernel_launch_size_args_t &ArgsToSet) {
+  size_t GlobalSize[3] = {1, 1, 1};
+  if (Range.MGlobalSize) {
+    for (uint32_t I = 0; I < Range.MDims; I++) {
+      GlobalSize[I] = Range.MGlobalSize[I];
+    }
+  }
+
+  size_t GroupSize[3] = {1, 1, 1};
+  if (Range.MLocalSize) {
+    for (uint32_t I = 0; I < Range.MDims; I++) {
+      GroupSize[I] = Range.MLocalSize[I];
+    }
+  }
+
+  ArgsToSet.Dimensions = Range.MDims;
+  ArgsToSet.NumGroups.x = GlobalSize[0] / GroupSize[0];
+  ArgsToSet.NumGroups.y = GlobalSize[1] / GroupSize[1];
+  ArgsToSet.NumGroups.z = GlobalSize[2] / GroupSize[2];
+  ArgsToSet.GroupSize.x = GroupSize[0];
+  ArgsToSet.GroupSize.y = GroupSize[1];
+  ArgsToSet.GroupSize.z = GroupSize[2];
+  ArgsToSet.DynSharedMemory = 0;
+}
+
 QueueImpl::QueueImpl(DeviceImpl &deviceImpl, const async_handler &asyncHandler,
                      const property_list &propList, PrivateTag)
     : MIsInorder(false), MAsyncHandler(asyncHandler), MPropList(propList),
@@ -34,5 +60,91 @@ backend QueueImpl::getBackend() const noexcept { return MDevice.getBackend(); }
 
 void QueueImpl::wait() { callAndThrow(olSyncQueue, MOffloadQueue); }
 
+static bool checkEventsPlatformMatch(std::vector<EventImplPtr> &Events,
+                                     const PlatformImpl &QueuePlatform) {
+  // liboffload limitation to olWaitEvents. We can't do any extra handling for
+  // cross context/platform events without host task support now.
+  //   "The input events can be from any queue on any device provided by the
+  //   same platform as `Queue`."
+  return std::all_of(Events.cbegin(), Events.cend(),
+                     [&QueuePlatform](const EventImplPtr &Event) {
+                       return &Event->getPlatformImpl() == &QueuePlatform;
+                     });
+}
+
+void QueueImpl::setKernelParameters(std::vector<EventImplPtr> &&Events,
+                                    const detail::UnifiedRangeView &Range) {
+  if (!checkEventsPlatformMatch(Events, MDevice.getPlatformImpl()))
+    throw sycl::exception(
+        sycl::make_error_code(sycl::errc::feature_not_supported),
+        "libsycl doesn't support cross-context/platform event dependencies "
+        "now.");
+
+  // TODO: this convertion and storing only offload events is possible only
+  // while we don't have host tasks (and features based on host tasks, like
+  // streams). With them - it is very likely we should copy EventImplPtr
+  // (shared_ptr) and keep it here. Although it may differ if host tasks will be
+  // implemented on offload level (no data now).
+  assert(MCurrentSubmitInfo.DepEvents.empty() &&
+         "Kernel submission must clean up dependencies.");
+  MCurrentSubmitInfo.DepEvents.reserve(Events.size());
+  for (auto &Event : Events) {
+    assert(Event && "Event impl object can't be nullptr");
+    MCurrentSubmitInfo.DepEvents.push_back(Event->getHandle());
+  }
+  setKernelLaunchArgs(Range, MCurrentSubmitInfo.Range);
+}
+
+void QueueImpl::submitKernelImpl(const char *KernelName,
+                                 detail::ArgCollection &TypelessArgs) {
+  ol_symbol_handle_t Kernel =
+      detail::ProgramManager::getInstance().getOrCreateKernel(KernelName,
+                                                              MDevice);
+  assert(Kernel);
+
+  ol_event_handle_t NewEvent{};
+  if (!MCurrentSubmitInfo.DepEvents.empty()) {
+    callAndThrow(olWaitEvents, MOffloadQueue,
+                 MCurrentSubmitInfo.DepEvents.data(),
+                 MCurrentSubmitInfo.DepEvents.size());
+  }
+
+  const void *Arguments = nullptr;
+  int64_t ArgumentsSize = 0;
+  if (TypelessArgs.getArgCount()) {
+    // without decomposition and free functions extension we always expect 1
+    // argument to the kernel - lambda capture.
+    assert(TypelessArgs.getArgCount() == 1 &&
+           "No arg decomposition or extensions are supported now.");
+    // TODO: liboffload doesn't support more than 1 argument without copy now.
+    // It doesn't expect array of arguments, it requires a contiguous memory
+    // with args. While we have only 1 argument we don't need extra handling
+    // here, we just pass the first argument directly.
+    Arguments = TypelessArgs.getArgPtrArray()[0];
+    ArgumentsSize = TypelessArgs.getSizesArray()[0];
+  }
+
+  // ol_kernel_launch_prop_t Props[2];
+  // Props[0].type = OL_KERNEL_LAUNCH_PROP_TYPE_SIZE;
+  // Props[0].data = &ArgumentsSize;
+  // Props[1] = OL_KERNEL_LAUNCH_PROP_END;
+  auto Result =
+      olLaunchKernel(MOffloadQueue, MDevice.getHandle(), Kernel, Arguments,
+                     ArgumentsSize, &MCurrentSubmitInfo.Range /*, Props*/);
+  // Clean up current kernel submit data to prepare structures for next
+  // submission.
+  MCurrentSubmitInfo.DepEvents.clear();
+  MCurrentSubmitInfo.Range = {};
+  if (isFailed(Result))
+    throw sycl::exception(sycl::make_error_code(sycl::errc::runtime),
+                          std::string("Kernel submission (") + KernelName +
+                              ") failed with " + formatCodeString(Result));
+
+  callAndThrow(olCreateEvent, MOffloadQueue, &NewEvent);
+
+  MCurrentSubmitInfo.LastEvent =
+      EventImpl::createEventWithHandle(NewEvent, MDevice.getPlatformImpl());
+}
+
 } // namespace detail
 _LIBSYCL_END_NAMESPACE_SYCL
diff --git a/libsycl/src/detail/queue_impl.hpp b/libsycl/src/detail/queue_impl.hpp
index cdb7595e852ec..6edb40471826a 100644
--- a/libsycl/src/detail/queue_impl.hpp
+++ b/libsycl/src/detail/queue_impl.hpp
@@ -15,6 +15,7 @@
 #include <OffloadAPI.h>
 
 #include <memory>
+#include <mutex>
 
 _LIBSYCL_BEGIN_NAMESPACE_SYCL
 namespace detail {
@@ -62,16 +63,52 @@ class QueueImpl : public std::enable_shared_from_this<QueueImpl> {
   /// \return true if and only if the queue is in order.
   bool isInOrder() const { return MIsInorder; }
 
+  /// Enqueues kernel to liboffload.
+  /// Kernel parameters like dependencies and range must be passed in advance by
+  /// calling setKernelParameters.
+  /// \param KernelName a name of kernel to be enqueued.
+  /// \param TypelessArgs data about kernel arguments to be used for enqueue.
+  void submitKernelImpl(const char *KernelName,
+                        detail::ArgCollection &TypelessArgs);
+
+  /// \return an event impl object that corresponds to the last kernel
+  /// submission in the calling thread.
+  EventImplPtr getLastEvent() {
+    assert(MCurrentSubmitInfo.LastEvent &&
+           "getLastEvent must be called after enqueue");
+    return MCurrentSubmitInfo.LastEvent;
+  }
+
+  /// Sets kernel parameters to be used in the next submitKernelImpl call.
+  /// Must be called prior to submitKernelImpl call.
+  /// \param Events a collection of events that kernal depends on.
+  /// \param Range a unified range view of execution range.
+  void setKernelParameters(std::vector<EventImplPtr> &&Events,
+                           const detail::UnifiedRangeView &Range);
+
   /// Waits for completion of all kernels submitted to this queue.
   void wait();
 
 private:
+  // Queue features.
   ol_queue_handle_t MOffloadQueue = {};
   const bool MIsInorder;
   const async_handler MAsyncHandler;
   const property_list MPropList;
   DeviceImpl &MDevice;
   ContextImpl &MContext;
+
+  // Submit data.
+  struct KernelSubmitInfo {
+    EventImplPtr LastEvent;
+    ol_kernel_launch_size_args_t Range;
+    // TODO: consider storing EventImplPtr here, it will work with plain handle
+    // only because submission is done within queue::submit call. Otherwise we
+    // need to ensure that event handle is still alive by keeping our own copy
+    // of EventImpl.
+    std::vector<ol_event_handle_t> DepEvents;
+  };
+  inline static thread_local KernelSubmitInfo MCurrentSubmitInfo = {};
 };
 
 } // namespace detail
diff --git a/libsycl/src/queue.cpp b/libsycl/src/queue.cpp
index 9fe020eabf2cc..f9d867e9567d7 100644
--- a/libsycl/src/queue.cpp
+++ b/libsycl/src/queue.cpp
@@ -33,6 +33,25 @@ device queue::get_device() const {
 
 bool queue::is_in_order() const { return impl->isInOrder(); }
 
+event queue::getLastEvent() {
+  return detail::createSyclObjFromImpl<event>(impl->getLastEvent());
+}
+
+void queue::setKernelParameters(const std::vector<event> &Events,
+                                const detail::UnifiedRangeView &Range) {
+  std::vector<detail::EventImplPtr> DepEventImplRefs;
+  DepEventImplRefs.reserve(Events.size());
+  for (const auto &Event : Events) {
+    DepEventImplRefs.push_back(detail::getSyclObjImpl(Event));
+  }
+  return impl->setKernelParameters(std::move(DepEventImplRefs), Range);
+}
+
+void queue::submitKernelImpl(const char *KernelName,
+                             detail::ArgCollection &TypelessArgs) {
+  impl->submitKernelImpl(KernelName, TypelessArgs);
+}
+
 void queue::wait() { return impl->wait(); }
 
 _LIBSYCL_END_NAMESPACE_SYCL
diff --git a/libsycl/test/basic/get_backend.cpp b/libsycl/test/basic/get_backend.cpp
new file mode 100644
index 0000000000000..064149a0c67e8
--- /dev/null
+++ b/libsycl/test/basic/get_backend.cpp
@@ -0,0 +1,54 @@
+// REQUIRES: any-device
+// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: %t.out
+
+#include <iostream>
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+class Kernel1;
+
+bool check(backend be) {
+  switch (be) {
+  case backend::opencl:
+  case backend::level_zero:
+  case backend::cuda:
+  case backend::hip:
+    return true;
+  default:
+    return false;
+  }
+  return false;
+}
+
+inline void return_fail() {
+  std::cout << "Failed" << std::endl;
+  exit(1);
+}
+
+int main() {
+  for (const auto &plt : platform::get_platforms()) {
+    if (check(plt.get_backend()) == false) {
+      return_fail();
+    }
+
+    auto device = device::get_devices()[0];
+    if (device.get_backend() != plt.get_backend()) {
+      return_fail();
+    }
+
+    queue q(device);
+    if (q.get_backend() != plt.get_backend()) {
+      return_fail();
+    }
+
+    event e = q.single_task<Kernel1>([]() {});
+    if (e.get_backend() != plt.get_backend()) {
+      return_fail();
+    }
+  }
+  std::cout << "Passed" << std::endl;
+  return 0;
+}
diff --git a/libsycl/test/basic/submit_fn_ptr.cpp b/libsycl/test/basic/submit_fn_ptr.cpp
new file mode 100644
index 0000000000000..2a5ce832d4db2
--- /dev/null
+++ b/libsycl/test/basic/submit_fn_ptr.cpp
@@ -0,0 +1,18 @@
+// REQUIRES: any-device
+// RUN: %clangxx -fsycl  %s -o %t.out
+// RUN: %t.out
+
+#include <sycl/sycl.hpp>
+
+class Test;
+
+int main() {
+  sycl::queue q;
+  int *p = sycl::malloc_shared<int>(1, q);
+  *p = 0;
+  q.single_task<Test>([=]() { *p = 42; });
+  q.wait();
+  assert(*p == 42);
+  sycl::free(p, q);
+  return 0;
+}

>From 104ccef02b0d36581c7a60bcf6d7459284e8db64 Mon Sep 17 00:00:00 2001
From: "Tikhomirova, Kseniya" <kseniya.tikhomirova at intel.com>
Date: Thu, 26 Mar 2026 07:05:33 -0700
Subject: [PATCH 2/7] draft

Signed-off-by: Tikhomirova, Kseniya <kseniya.tikhomirova at intel.com>

add tests for parallel_for

Signed-off-by: Tikhomirova, Kseniya <kseniya.tikhomirova at intel.com>

remove operators from index space classes

Signed-off-by: Tikhomirova, Kseniya <kseniya.tikhomirova at intel.com>
---
 libsycl/docs/index.rst                        |   1 +
 .../sycl/__impl/detail/kernel_arg_helpers.hpp | 187 ++++++++
 .../sycl/__impl/detail/unified_range_view.hpp |   6 +-
 .../sycl/__impl/index_space_classes.hpp       | 413 ++++++++++++++++++
 libsycl/include/sycl/__impl/queue.hpp         | 223 ++++++++--
 libsycl/include/sycl/__spirv/spirv_vars.hpp   |  75 ++++
 .../test/basic/queue_parallel_for_generic.cpp |  47 ++
 libsycl/test/basic/wrapped_usm_pointers.cpp   | 111 +++++
 8 files changed, 1031 insertions(+), 32 deletions(-)
 create mode 100644 libsycl/include/sycl/__impl/detail/kernel_arg_helpers.hpp
 create mode 100644 libsycl/include/sycl/__impl/index_space_classes.hpp
 create mode 100644 libsycl/include/sycl/__spirv/spirv_vars.hpp
 create mode 100644 libsycl/test/basic/queue_parallel_for_generic.cpp
 create mode 100644 libsycl/test/basic/wrapped_usm_pointers.cpp

diff --git a/libsycl/docs/index.rst b/libsycl/docs/index.rst
index 5961eeeedcedb..585d05a78987d 100644
--- a/libsycl/docs/index.rst
+++ b/libsycl/docs/index.rst
@@ -126,6 +126,7 @@ TODO for added SYCL classes
   * handle sub devices once they are implemented (blocked by liboffload support)
 
 * ``event``: get_wait_list, get_info, get_profiling_info, wait_and_throw & default ctor are not implemented
+* ``range``, ``id`` - to add operators
 * general opens:
 
   * define a way to report errors from object dtors.
\ No newline at end of file
diff --git a/libsycl/include/sycl/__impl/detail/kernel_arg_helpers.hpp b/libsycl/include/sycl/__impl/detail/kernel_arg_helpers.hpp
new file mode 100644
index 0000000000000..d4a0ea9f63ff2
--- /dev/null
+++ b/libsycl/include/sycl/__impl/detail/kernel_arg_helpers.hpp
@@ -0,0 +1,187 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// to add
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBSYCL___IMPL_DETAIL_KERNEL_ARG_HELPERS
+#define _LIBSYCL___IMPL_DETAIL_KERNEL_ARG_HELPERS
+
+#include <sycl/__impl/index_space_classes.hpp>
+
+#include <sycl/__impl/detail/config.hpp>
+
+#ifdef __SYCL_DEVICE_ONLY__
+#  include <sycl/__spirv/spirv_vars.hpp>
+#endif
+
+_LIBSYCL_BEGIN_NAMESPACE_SYCL
+
+namespace detail {
+
+/// \name  Helpers for the unnamed lambda extension.
+/// @{
+/// This class is the default kernel name template parameter type for kernel
+/// invocation APIs such as single_task.
+class AutoName {};
+
+/// Helper struct to get a kernel name type based on given Name and Type
+/// types: if Name is undefined (is a AutoName) then Type becomes
+/// the Name.
+template <typename Name, typename Type> struct get_kernel_name_t {
+  using name = Name;
+};
+
+/// Specialization for the case when Name is undefined.
+/// This is only legal with our compiler with the unnamed lambda extension or if
+/// the kernel is a functor object.
+template <typename Type> struct get_kernel_name_t<detail::AutoName, Type> {
+  using name = Type;
+};
+/// @}
+
+/// \name  Helpers to verify kernel lambda type.
+/// \brief Checks that the function is callable with operator().
+/// @{
+template <typename, typename T> struct CheckFunctionSignature {
+  static_assert(std::integral_constant<T, false>::value,
+                "Second template parameter is required to be of function type");
+};
+
+template <typename F, typename RetT, typename... Args>
+struct CheckFunctionSignature<F, RetT(Args...)> {
+private:
+  template <typename T>
+  static constexpr auto check(T *) -> typename std::is_same<
+      decltype(std::declval<T>().operator()(std::declval<Args>()...)),
+      RetT>::type;
+
+  template <typename> static constexpr std::false_type check(...);
+
+  using type = decltype(check<F>(0));
+
+public:
+  static constexpr bool value = type::value;
+};
+/// @}
+
+/// \name  Helpers to extract types of lambda arguments.
+/// @{
+template <typename RetType, typename Func, typename Arg>
+static Arg member_ptr_helper(RetType (Func::*)(Arg) const);
+
+// Non-const version of the above template to match functors whose
+// 'operator()' is declared w/o the 'const' qualifier.
+template <typename RetType, typename Func, typename Arg>
+static Arg member_ptr_helper(RetType (Func::*)(Arg));
+
+template <typename F, typename SuggestedArgType>
+decltype(member_ptr_helper(&F::operator())) argument_helper(int);
+
+template <typename F, typename SuggestedArgType>
+SuggestedArgType argument_helper(...);
+
+template <typename F, typename SuggestedArgType>
+using lambda_arg_type = decltype(argument_helper<F, SuggestedArgType>(0));
+
+#if __has_builtin(__type_pack_element)
+template <int N, typename... Ts>
+using nth_type_t = __type_pack_element<N, Ts...>;
+#else
+template <int N, typename T, typename... Ts> struct nth_type {
+  using type = typename nth_type<N - 1, Ts...>::type;
+};
+
+template <typename T, typename... Ts> struct nth_type<0, T, Ts...> {
+  using type = T;
+};
+
+template <int N, typename... Ts>
+using nth_type_t = typename nth_type<N, Ts...>::type;
+#endif
+/// @}
+
+template <typename T> T *declptr() { return static_cast<T *>(nullptr); }
+
+template <int N>
+static inline constexpr bool isValidDimensions = (N > 0) && (N < 4);
+
+/// Class provides helper functions for iteration space coordinates in kernel
+/// invocation on device.
+class Builder {
+public:
+  Builder() = delete;
+
+#ifdef __SYCL_DEVICE_ONLY__
+  /// \return a global index of work item currently being operated on by device.
+  template <int Dims> static const id<Dims> getElement(id<Dims> *) {
+    static_assert(isValidDimensions<Dims>, "invalid dimensions");
+    return __spirv::initBuiltInGlobalInvocationId<Dims, id<Dims>>();
+  }
+
+  /// Constructs item with the given data.
+  /// \param Extent a range representing the dimensions of the range of possible
+  /// values of the item.
+  /// \param Index a constituent id representing the work-item’s position in the
+  /// iteration space.
+  /// \param Offset an id representing the n-dimensional offset that should be
+  /// added to the global-ID of each work-item, if this item represents a global
+  /// range. Deprecated in SYCL 2020.
+  template <int Dims, bool WithOffset>
+  static std::enable_if_t<WithOffset, item<Dims, WithOffset>>
+  createItem(const range<Dims> &Extent, const id<Dims> &Index,
+             const id<Dims> &Offset) {
+    return item<Dims, WithOffset>(Extent, Index, Offset);
+  }
+
+  /// Constructs item with the given data.
+  /// \param Extent a range representing the dimensions of the range of possible
+  /// values of the item.
+  /// \param Index a constituent id representing the work-item’s position in the
+  /// iteration space.
+  template <int Dims, bool WithOffset>
+  static std::enable_if_t<!WithOffset, item<Dims, WithOffset>>
+  createItem(const range<Dims> &Extent, const id<Dims> &Index) {
+    return item<Dims, WithOffset>(Extent, Index);
+  }
+
+  /// Creates sycl::item instance for work item that is currently being operated
+  /// on.
+  template <int Dims, bool WithOffset>
+  static std::enable_if_t<WithOffset, const item<Dims, WithOffset>> getItem() {
+    static_assert(isValidDimensions<Dims>, "invalid dimensions");
+    id<Dims> GlobalId{__spirv::initBuiltInGlobalInvocationId<Dims, id<Dims>>()};
+    range<Dims> GlobalSize{__spirv::initBuiltInGlobalSize<Dims, range<Dims>>()};
+    id<Dims> GlobalOffset{__spirv::initBuiltInGlobalOffset<Dims, id<Dims>>()};
+    return createItem<Dims, true>(GlobalSize, GlobalId, GlobalOffset);
+  }
+
+  /// Creates sycl::item instance for work item that is currently being operated
+  /// on.
+  template <int Dims, bool WithOffset>
+  static std::enable_if_t<!WithOffset, const item<Dims, WithOffset>> getItem() {
+    static_assert(isValidDimensions<Dims>, "invalid dimensions");
+    id<Dims> GlobalId{__spirv::initBuiltInGlobalInvocationId<Dims, id<Dims>>()};
+    range<Dims> GlobalSize{__spirv::initBuiltInGlobalSize<Dims, range<Dims>>()};
+    return createItem<Dims, false>(GlobalSize, GlobalId);
+  }
+
+  /// \return a work item currently being operated on by device.
+  template <int Dims, bool WithOffset>
+  static auto getElement(item<Dims, WithOffset> *)
+      -> decltype(getItem<Dims, WithOffset>()) {
+    return getItem<Dims, WithOffset>();
+  }
+
+#endif // __SYCL_DEVICE_ONLY__
+};
+
+} // namespace detail
+
+_LIBSYCL_END_NAMESPACE_SYCL
+
+#endif // _LIBSYCL___IMPL_DETAIL_KERNEL_ARG_HELPERS
diff --git a/libsycl/include/sycl/__impl/detail/unified_range_view.hpp b/libsycl/include/sycl/__impl/detail/unified_range_view.hpp
index afa613fc8627b..8f321349d4c2e 100644
--- a/libsycl/include/sycl/__impl/detail/unified_range_view.hpp
+++ b/libsycl/include/sycl/__impl/detail/unified_range_view.hpp
@@ -17,6 +17,8 @@
 
 #include <sycl/__impl/detail/config.hpp>
 
+#include <sycl/__impl/index_space_classes.hpp>
+
 _LIBSYCL_BEGIN_NAMESPACE_SYCL
 
 namespace detail {
@@ -33,7 +35,9 @@ class UnifiedRangeView {
   UnifiedRangeView &operator=(const UnifiedRangeView &Desc) = default;
   UnifiedRangeView &operator=(UnifiedRangeView &&Desc) = default;
 
-  // TODO: ctors with sycl::range and nd::range will be added later.
+  template <int Dims>
+  UnifiedRangeView(sycl::range<Dims> &N)
+      : MGlobalSize(&(N[0])), MDims(size_t(Dims)) {}
 
   UnifiedRangeView(const size_t *GlobalSize, const size_t *LocalSize,
                    const size_t *Offset, size_t Dims)
diff --git a/libsycl/include/sycl/__impl/index_space_classes.hpp b/libsycl/include/sycl/__impl/index_space_classes.hpp
new file mode 100644
index 0000000000000..ef2897cee5307
--- /dev/null
+++ b/libsycl/include/sycl/__impl/index_space_classes.hpp
@@ -0,0 +1,413 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the declaration of the SYCL 2020 ranges and index space
+/// identifiers (4.9.1.).
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBSYCL___IMPL_INDEX_SPACE_CLASSES_HPP
+#define _LIBSYCL___IMPL_INDEX_SPACE_CLASSES_HPP
+
+#include <sycl/__impl/detail/config.hpp>
+
+_LIBSYCL_BEGIN_NAMESPACE_SYCL
+
+namespace detail {
+
+class Builder;
+
+/// Helper class for dimensions data management.
+template <int Dimensions = 1> class RawArray {
+  static_assert(Dimensions >= 1 && Dimensions <= 3,
+                "RawArray can only be 1, 2, or 3 Dimensional.");
+
+public:
+  /// Constructs one-dimensional instance and assign corresponding data to Dim0
+  /// value. Available only if Dimensions = 1.
+  template <int N = Dimensions, std::enable_if_t<N == 1, bool> = true>
+  RawArray(size_t Dim0 = 0) : MArray{Dim0} {}
+
+  /// Constructs two-dimensional instance and assign corresponding data.
+  /// Available only if Dimensions = 2.
+  template <int N = Dimensions, std::enable_if_t<N == 2, bool> = true>
+  RawArray(size_t Dim0, size_t Dim1) : MArray{Dim0, Dim1} {}
+
+  /// Constructs two-dimensional instance with zero-initialized corresponding
+  /// data. Available only if Dimensions = 2.
+  template <int N = Dimensions, std::enable_if_t<N == 2, bool> = true>
+  RawArray() : RawArray(0, 0) {}
+
+  /// Constructs three-dimensional instance and assign corresponding data.
+  /// Available only if Dimensions = 3.
+  template <int N = Dimensions, std::enable_if_t<N == 3, bool> = true>
+  RawArray(size_t Dim0, size_t Dim1, size_t Dim2) : MArray{Dim0, Dim1, Dim2} {}
+
+  /// Constructs three-dimensional instance with zero-initialized corresponding
+  /// data. Available only if Dimensions = 3.
+  template <int N = Dimensions, std::enable_if_t<N == 3, bool> = true>
+  RawArray() : RawArray(0, 0, 0) {}
+
+  /// Returns value for the specified dimension.
+  /// Results in undefined behavior if dimension is not in the range [0,
+  /// Dimensions).
+  /// \param Dimension a dimension to query data for.
+  /// \return value in array matching requested dimension.
+  std::size_t get(int Dimension) const noexcept { return MArray[Dimension]; }
+
+  /// Returns value for the specified dimension.
+  /// Results in undefined behavior if dimension is not in the range [0,
+  /// Dimensions).
+  /// \param Dimension a dimension to query data for.
+  /// \return value in array matching requested dimension.
+  std::size_t &operator[](int Dimension) noexcept { return MArray[Dimension]; }
+
+  /// Returns value for the specified dimension.
+  /// Results in undefined behavior if dimension is not in the range [0,
+  /// Dimensions).
+  /// \param Dimension a dimension to query data for.
+  /// \return value in array matching requested dimension.
+  std::size_t operator[](int Dimension) const noexcept {
+    return MArray[Dimension];
+  }
+
+  RawArray(const RawArray<Dimensions> &rhs) = default;
+  RawArray(RawArray<Dimensions> &&rhs) = default;
+  RawArray<Dimensions> &operator=(const RawArray<Dimensions> &rhs) = default;
+  RawArray<Dimensions> &operator=(RawArray<Dimensions> &&rhs) = default;
+  ~RawArray() = default;
+
+  friend bool operator==(const RawArray<Dimensions> &lhs,
+                         const RawArray<Dimensions> &rhs) {
+    for (int i = 0; i < Dimensions; ++i) {
+      if (lhs.MArray[i] != rhs.MArray[i]) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  friend bool operator!=(const RawArray<Dimensions> &lhs,
+                         const RawArray<Dimensions> &rhs) {
+    for (int i = 0; i < Dimensions; ++i) {
+      if (lhs.MArray[i] != rhs.MArray[i]) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+protected:
+  size_t MArray[Dimensions];
+};
+} // namespace detail
+
+/// SYCL 2020 4.9.1.1. range class.
+/// range<int Dimensions> is a 1D, 2D or 3D vector that defines the iteration
+/// domain of either a single work-group in a parallel dispatch, or the overall
+/// Dimensions of the dispatch.
+template <int Dimensions = 1>
+class range : public detail::RawArray<Dimensions> {
+  static_assert(Dimensions >= 1 && Dimensions <= 3,
+                "range can only be 1, 2, or 3 Dimensional.");
+  using Base = detail::RawArray<Dimensions>;
+
+public:
+  static constexpr int dimensions = Dimensions;
+  range() noexcept = default;
+  range(const range<Dimensions> &rhs) = default;
+  range(range<Dimensions> &&rhs) = default;
+  range<Dimensions> &operator=(const range<Dimensions> &rhs) = default;
+  range<Dimensions> &operator=(range<Dimensions> &&rhs) = default;
+
+  /// Construct a 1D range with value dim0.
+  ///  Only valid when the template parameter Dimensions is equal to 1.
+  template <int N = Dimensions, std::enable_if_t<N == 1, bool> = true>
+  range(std::size_t dim0) noexcept : Base(dim0) {}
+
+  /// Construct a 2D range with values dim0 and dim1.
+  /// Only valid when the template parameter Dimensions is equal to 2.
+  template <int N = Dimensions, std::enable_if_t<N == 2, bool> = true>
+  range(std::size_t dim0, std::size_t dim1) noexcept : Base(dim0, dim1) {}
+
+  /// Construct a 3D range with values dim0, dim1 and dim2.
+  /// Only valid when the template parameter Dimensions is equal to 3.
+  template <int N = Dimensions, std::enable_if_t<N == 3, bool> = true>
+  range(std::size_t dim0, std::size_t dim1, std::size_t dim2) noexcept
+      : Base(dim0, dim1, dim2) {}
+
+  /*
+  Declared and implemented in detail::RawArray:
+      std::size_t get(int dimension) const noexcept;
+      std::size_t& operator[](int dimension) noexcept;
+      std::size_t operator[](int dimension) const noexcept;
+  */
+
+  /// \return the size of the range computed as dimension0*…*dimensionN.
+  std::size_t size() const noexcept {
+    std::size_t size = 1;
+    for (int i = 0; i < Dimensions; ++i) {
+      size *= Base::MArray[i];
+    }
+    return size;
+  }
+
+  // TODO: operators to be added
+};
+
+/// c++ deduction guides.
+#ifdef __cpp_deduction_guides
+range(std::size_t) -> range<1>;
+range(std::size_t, std::size_t) -> range<2>;
+range(std::size_t, std::size_t, std::size_t) -> range<3>;
+#endif
+
+template <int Dimensions = 1, bool WithOffset = true> class item;
+
+/// SYCL 2020 4.9.1.3. id class.
+/// id<int Dimensions> is a vector of Dimensions that is used to represent an id
+/// into a global or local range. It can be used as an index in an accessor of
+/// the same rank.
+template <int Dimensions = 1> class id : public detail::RawArray<Dimensions> {
+  static_assert(Dimensions >= 1 && Dimensions <= 3,
+                "id can only be 1, 2, or 3 Dimensional.");
+  using Base = detail::RawArray<Dimensions>;
+
+  // Helper class for conversion operator. Void type is not suitable. User
+  // cannot even try to get address of the operator PrivateTag(). User
+  // may try to get an address of operator void() and will get the
+  // compile-time error
+  class PrivateTag;
+  template <bool Condition, typename T>
+  using EnableIfT = std::conditional_t<Condition, T, PrivateTag>;
+
+public:
+  static constexpr int dimensions = Dimensions;
+
+  id() noexcept = default;
+  id(const id<Dimensions> &rhs) = default;
+  id(id<Dimensions> &&rhs) = default;
+  id<Dimensions> &operator=(const id<Dimensions> &rhs) = default;
+  id<Dimensions> &operator=(id<Dimensions> &&rhs) = default;
+
+  /// Construct a 1D id with value dim0.
+  /// Only valid when the template parameter Dimensions is equal to 1.
+  template <int N = Dimensions, std::enable_if_t<N == 1, bool> = true>
+  id(std::size_t dim0) noexcept : Base(dim0) {}
+
+  /// Construct a 2D id with values dim0, dim1.
+  /// Only valid when the template parameter Dimensions is equal to 2.
+  template <int N = Dimensions, std::enable_if_t<N == 2, bool> = true>
+  id(std::size_t dim0, std::size_t dim1) noexcept : Base(dim0, dim1) {}
+
+  /// Construct a 3D id with values dim0, dim1, dim2.
+  /// Only valid when the template parameter Dimensions is equal to 3.
+  template <int N = Dimensions, std::enable_if_t<N == 3, bool> = true>
+  id(std::size_t dim0, std::size_t dim1, std::size_t dim2) noexcept
+      : Base(dim0, dim1, dim2) {}
+
+  /// Construct an id from the dimensions of range.
+  /// Only valid when the template parameter Dimensions is equal to 1.
+  template <int N = Dimensions, std::enable_if_t<N == 1, bool> = true>
+  id(const range<Dimensions> &range) noexcept : Base(range.get(0)) {}
+
+  /// Construct an id from the dimensions of range.
+  /// Only valid when the template parameter Dimensions is equal to 2.
+  template <int N = Dimensions, std::enable_if_t<N == 2, bool> = true>
+  id(const range<Dimensions> &range) noexcept
+      : Base(range.get(0), range.get(1)) {}
+
+  /// Construct an id from the dimensions of range.
+  /// Only valid when the template parameter Dimensions is equal to 3.
+  template <int N = Dimensions, std::enable_if_t<N == 3, bool> = true>
+  id(const range<Dimensions> &range) noexcept
+      : Base(range.get(0), range.get(1), range.get(2)) {}
+
+  /// Construct an id from item.get_id().
+  /// Only valid when the template parameter Dimensions is equal to 1.
+  template <int N = Dimensions, std::enable_if_t<N == 1, bool> = true>
+  id(const item<Dimensions> &item) noexcept : Base(item.get_id(0)) {}
+
+  /// Construct an id from item.get_id().
+  /// Only valid when the template parameter Dimensions is equal to 2.
+  template <int N = Dimensions, std::enable_if_t<N == 2, bool> = true>
+  id(const item<Dimensions> &item) noexcept
+      : Base(item.get_id(0), item.get_id(1)) {}
+
+  /// Construct an id from item.get_id().
+  /// Only valid when the template parameter Dimensions is equal to 3.
+  template <int N = Dimensions, std::enable_if_t<N == 3, bool> = true>
+  id(const item<Dimensions> &item) noexcept
+      : Base(item.get_id(0), item.get_id(1), item.get_id(2)) {}
+
+  /*
+    Declared and implemented in detail::RawArray:
+        std::size_t get(int dimension) const noexcept;
+        std::size_t& operator[](int dimension) noexcept;
+        std::size_t operator[](int dimension) const noexcept;
+    */
+
+  // Template operator is not allowed because it disables further type
+  //   conversion. For example, the next code will not work in case of template
+  //   conversion:
+  //   int a = id<1>(value);
+  /// Returns the same value as get(0).
+  ///  Available only when: Dimensions == 1.
+  operator EnableIfT<(Dimensions == 1), std::size_t>() const noexcept {
+    return Base::get(0);
+  }
+
+  // TODO: operators to be added
+};
+
+/// c++ deduction guides.
+#ifdef __cpp_deduction_guides
+id(std::size_t) -> id<1>;
+id(std::size_t, std::size_t) -> id<2>;
+id(std::size_t, std::size_t, std::size_t) -> id<3>;
+#endif
+
+/// SYCL 2020 4.9.1.4. item class.
+/// item identifies an instance of the function object executing at each point
+/// in a range.
+template <int Dimensions /* = 1*/, bool WithOffset /* = true*/> class item {
+  /* Helper class for conversion operator. Void type is not suitable. User
+   * cannot even try to get address of the operator PrivateTag(). User
+   * may try to get an address of operator void() and will get the
+   * compile-time error */
+  class PrivateTag;
+  template <bool Condition, typename T>
+  using EnableIfT = std::conditional_t<Condition, T, PrivateTag>;
+
+public:
+  static constexpr int dimensions = Dimensions;
+
+  item() = delete;
+
+  item(const item &rhs) = default;
+
+  item(item<Dimensions, WithOffset> &&rhs) = default;
+
+  item &operator=(const item &rhs) = default;
+
+  item &operator=(item &&rhs) = default;
+
+  friend bool operator==(const item<Dimensions, WithOffset> &lhs,
+                         const item<Dimensions, WithOffset> &rhs) {
+    if constexpr (WithOffset)
+      return (lhs.MId == rhs.MId) && (lhs.MRange == rhs.MRange) &&
+             (lhs.MOffset == rhs.MOffset);
+    else
+      return (lhs.MId == rhs.MId) && (lhs.MRange == rhs.MRange);
+  }
+
+  friend bool operator!=(const item<Dimensions, WithOffset> &lhs,
+                         const item<Dimensions, WithOffset> &rhs) {
+    return !(lhs == rhs);
+  }
+
+  /// \return the constituent id representing the work-item’s position in the
+  /// iteration space.
+  id<Dimensions> get_id() const noexcept { return MId; }
+
+  /// Equivalent to return get_id()[dimension].
+  std::size_t get_id(int dimension) const noexcept {
+    return MId.get(dimension);
+  }
+
+  /// Equivalent to return get_id(dimension).
+  std::size_t operator[](int dimension) const noexcept {
+    return MId[dimension];
+  }
+
+  /// \return a range representing the dimensions of the range of possible
+  /// values of the item.
+  range<Dimensions> get_range() const noexcept { return MRange; }
+
+  /// Equivalent to return get_range().get(dimension).
+  std::size_t get_range(int dimension) const noexcept {
+    return MRange[dimension];
+  }
+
+  /// Deprecated in SYCL 2020.
+  /// For an item converted from an item with no offset this will always return
+  /// an id of all 0 values. This member function is only available if
+  /// WithOffset is true.
+  /// \return an id representing the n-dimensional offset provided to the
+  /// parallel_for and that is added by the runtime to the global-ID of each
+  /// work-item, if this item represents a global range.
+  template <bool HasOffset = WithOffset,
+            std::enable_if_t<HasOffset == true, bool> = true>
+  id<Dimensions> get_offset() const noexcept {
+    return MOffset;
+  }
+
+  /// Deprecated in SYCL 2020.
+  /// This conversion allow users to seamlessly write code that assumes an
+  /// offset and still provides an offset-less item. Available only when:
+  /// WithOffset == false.
+  /// \return an item representing the same information as the object holds but
+  /// also includes the offset set to 0.
+  template <bool HasOffset = WithOffset,
+            std::enable_if_t<HasOffset == false, bool> = true>
+  operator item<Dimensions, true>() const noexcept {
+    return item<Dimensions, true>(MRange, MId, id<Dimensions>{});
+  }
+
+  /// Equivalent to get_id(0).
+  /// Available only when: Dimensions == 1.
+  operator EnableIfT<(Dimensions == 1), std::size_t>() const noexcept {
+    return get_id(0);
+  }
+
+  /// \return Return the id as a linear index value.
+  std::size_t get_linear_id() const noexcept {
+    if constexpr (WithOffset) {
+      if constexpr (1 == Dimensions) {
+        return MId;
+      }
+      if constexpr (2 == Dimensions) {
+        return (MId[0] - MOffset[0]) * MRange[1] + (MId[1] - MOffset[1]);
+      }
+      return ((MId[0] - MOffset[0]) * MRange[1] * MRange[2]) +
+             ((MId[1] - MOffset[1]) * MRange[2]) + (MId[2] - MOffset[2]);
+    } else {
+      if constexpr (1 == Dimensions) {
+        return MId[0];
+      }
+      if constexpr (2 == Dimensions) {
+        return MId[0] * MRange[1] + MId[1];
+      }
+      return (MId[0] * MRange[1] * MRange[2]) + (MId[1] * MRange[2]) + MId[2];
+    }
+  }
+
+protected:
+  template <bool HasOffset = WithOffset,
+            std::enable_if_t<HasOffset == true, bool> = true>
+  item(const sycl::range<Dimensions> &range, const sycl::id<Dimensions> &id,
+       const sycl::id<Dimensions> &offset)
+      : MRange(range), MId(id), MOffset(offset) {}
+
+  template <bool HasOffset = WithOffset,
+            std::enable_if_t<HasOffset == false, bool> = true>
+  item(const range<Dimensions> &range, const id<Dimensions> &id)
+      : MRange(range), MId(id), MOffset() {}
+
+private:
+  range<Dimensions> MRange;
+  id<Dimensions> MId;
+  id<Dimensions> MOffset;
+
+  friend class detail::Builder;
+};
+
+_LIBSYCL_END_NAMESPACE_SYCL
+
+#endif // _LIBSYCL___IMPL_INDEX_SPACE_CLASSES_HPP
diff --git a/libsycl/include/sycl/__impl/queue.hpp b/libsycl/include/sycl/__impl/queue.hpp
index d1ac320433c38..95653ab0c34ff 100644
--- a/libsycl/include/sycl/__impl/queue.hpp
+++ b/libsycl/include/sycl/__impl/queue.hpp
@@ -23,6 +23,7 @@
 #include <sycl/__impl/detail/arg_wrapper.hpp>
 #include <sycl/__impl/detail/config.hpp>
 #include <sycl/__impl/detail/default_async_handler.hpp>
+#include <sycl/__impl/detail/kernel_arg_helpers.hpp>
 #include <sycl/__impl/detail/obj_utils.hpp>
 #include <sycl/__impl/detail/unified_range_view.hpp>
 
@@ -32,28 +33,6 @@ class context;
 
 namespace detail {
 class QueueImpl;
-
-template <typename, typename T> struct CheckFunctionSignature {
-  static_assert(std::integral_constant<T, false>::value,
-                "Second template parameter is required to be of function type");
-};
-
-template <typename F, typename RetT, typename... Args>
-struct CheckFunctionSignature<F, RetT(Args...)> {
-private:
-  template <typename T>
-  static constexpr auto check(T *) -> typename std::is_same<
-      decltype(std::declval<T>().operator()(std::declval<Args>()...)),
-      RetT>::type;
-
-  template <typename> static constexpr std::false_type check(...);
-
-  using type = decltype(check<F>(0));
-
-public:
-  static constexpr bool value = type::value;
-};
-
 } // namespace detail
 
 // SYCL 2020 4.6.5. Queue class.
@@ -166,7 +145,7 @@ class _LIBSYCL_EXPORT queue {
   ///
   /// \param kernelFunc is the kernel functor or lambda.
   /// \return an event that represents the status of the submitted kernel.
-  template <typename KernelName, typename KernelType>
+  template <typename KernelName = detail::AutoName, typename KernelType>
   event single_task(const KernelType &kernelFunc) {
     return single_task<KernelName, KernelType>({}, kernelFunc);
   }
@@ -177,7 +156,7 @@ class _LIBSYCL_EXPORT queue {
   /// \param depEvent is an event that specifies the kernel dependency.
   /// \param kernelFunc is the kernel functor or lambda.
   /// \return an event that represents the status of the submitted kernel.
-  template <typename KernelName, typename KernelType>
+  template <typename KernelName = detail::AutoName, typename KernelType>
   event single_task(event depEvent, const KernelType &kernelFunc) {
     return single_task<KernelName, KernelType>({depEvent}, kernelFunc);
   }
@@ -189,7 +168,7 @@ class _LIBSYCL_EXPORT queue {
   /// dependencies.
   /// \param kernelFunc is the kernel functor or lambda.
   /// \return an event that represents the status of the submitted kernel.
-  template <typename KernelName, typename KernelType>
+  template <typename KernelName = detail::AutoName, typename KernelType>
   event single_task(const std::vector<event> &depEvents,
                     const KernelType &kernelFunc) {
     static_assert(
@@ -199,18 +178,169 @@ class _LIBSYCL_EXPORT queue {
         "group. ");
 
     setKernelParameters(depEvents);
-    submitSingleTask<KernelName, KernelType>(kernelFunc);
+    using NameT =
+        typename detail::get_kernel_name_t<KernelName, KernelType>::name;
+    submitSingleTask<NameT, KernelType>(kernelFunc);
     return getLastEvent();
   }
 
+  /// Defines and invokes a SYCL kernel function as a lambda expression or a
+  /// named function object type, for the specified range.
+  ///
+  /// \param numWorkItems specifies the global work space of the kernel.
+  /// \param rest acts as-if: const KernelType &KernelFunc".
+  // TODO: Rest will represent reduction types once it is supported.
+  template <typename KernelName = detail::AutoName, typename... Rest>
+  event parallel_for(range<1> numWorkItems, Rest &&...rest) {
+    return parallel_for<KernelName>(numWorkItems, {},
+                                    std::forward<Rest>(rest)...);
+  }
+
+  /// Defines and invokes a SYCL kernel function as a lambda expression or a
+  /// named function object type, for the specified range.
+  ///
+  /// \param numWorkItems specifies the global work space of the kernel.
+  /// \param rest acts as-if: const KernelType &KernelFunc".
+  // TODO: Rest will represent reduction types once it is supported.
+  template <typename KernelName = detail::AutoName, typename... Rest>
+  event parallel_for(range<2> numWorkItems, Rest &&...rest) {
+    return parallel_for<KernelName>(numWorkItems, {},
+                                    std::forward<Rest>(rest)...);
+  }
+
+  /// Defines and invokes a SYCL kernel function as a lambda expression or a
+  /// named function object type, for the specified range.
+  ///
+  /// \param numWorkItems specifies the global work space of the kernel.
+  /// \param rest acts as-if: const KernelType &KernelFunc".
+  // TODO: Rest will represent reduction types once it is supported.
+  template <typename KernelName = detail::AutoName, typename... Rest>
+  event parallel_for(range<3> numWorkItems, Rest &&...rest) {
+    return parallel_for<KernelName>(numWorkItems, {},
+                                    std::forward<Rest>(rest)...);
+  }
+
+  /// Defines and invokes a SYCL kernel function as a lambda expression or a
+  /// named function object type, for the specified range.
+  ///
+  /// \param numWorkItems specifies the global work space of the kernel.
+  /// \param depEvent adds a requirement that the action represented by depEvent
+  /// must complete before executing this kernel.
+  /// \param rest acts as-if: const KernelType &KernelFunc".
+  // TODO: Rest will represent reduction types once it is supported.
+  template <typename KernelName = detail::AutoName, typename... Rest>
+  event parallel_for(range<1> numWorkItems, event depEvent, Rest &&...rest) {
+    return parallel_for<KernelName>(numWorkItems, {depEvent},
+                                    std::forward<Rest>(rest)...);
+  }
+
+  /// Defines and invokes a SYCL kernel function as a lambda expression or a
+  /// named function object type, for the specified range.
+  ///
+  /// \param numWorkItems specifies the global work space of the kernel.
+  /// \param depEvent adds a requirement that the action represented by depEvent
+  /// must complete before executing this kernel.
+  /// \param rest acts as-if: const KernelType &KernelFunc".
+  // TODO: Rest will represent reduction types once it is supported.
+  template <typename KernelName = detail::AutoName, typename... Rest>
+  event parallel_for(range<2> numWorkItems, event depEvent, Rest &&...rest) {
+    return parallel_for<KernelName>(numWorkItems, {depEvent},
+                                    std::forward<Rest>(rest)...);
+  }
+
+  /// Defines and invokes a SYCL kernel function as a lambda expression or a
+  /// named function object type, for the specified range.
+  ///
+  /// \param numWorkItems specifies the global work space of the kernel.
+  /// \param depEvent adds a requirement that the action represented by depEvent
+  /// must complete before executing this kernel.
+  /// \param rest acts as-if: const KernelType &KernelFunc".
+  // TODO: Rest will represent reduction types once it is supported.
+  template <typename KernelName = detail::AutoName, typename... Rest>
+  event parallel_for(range<3> numWorkItems, event depEvent, Rest &&...rest) {
+    return parallel_for<KernelName>(numWorkItems, {depEvent},
+                                    std::forward<Rest>(rest)...);
+  }
+
+  /// Defines and invokes a SYCL kernel function as a lambda expression or a
+  /// named function object type, for the specified range.
+  ///
+  /// \param numWorkItems specifies the global work space of the kernel
+  /// \param depEvents is a vector of events that specifies the kernel
+  /// dependencies.
+  /// \param rest acts as-if: const KernelType &KernelFunc".
+  // TODO: Rest will represent reduction types once it is supported.
+  template <typename KernelName = detail::AutoName, typename... Rest>
+  event parallel_for(range<1> numWorkItems, const std::vector<event> &depEvents,
+                     Rest &&...rest) {
+    return parallelForImpl<KernelName>(numWorkItems, depEvents,
+                                       std::forward<Rest>(rest)...);
+  }
+
+  /// Defines and invokes a SYCL kernel function as a lambda expression or a
+  /// named function object type, for the specified range.
+  ///
+  /// \param numWorkItems specifies the global work space of the kernel
+  /// \param depEvents is a vector of events that specifies the kernel
+  /// dependencies.
+  /// \param rest acts as-if: const KernelType &KernelFunc".
+  // TODO: Rest will represent reduction types once it is supported.
+  template <typename KernelName = detail::AutoName, typename... Rest>
+  event parallel_for(range<2> numWorkItems, const std::vector<event> &depEvents,
+                     Rest &&...rest) {
+    return parallelForImpl<KernelName>(numWorkItems, depEvents,
+                                       std::forward<Rest>(rest)...);
+  }
+
+  /// Defines and invokes a SYCL kernel function as a lambda expression or a
+  /// named function object type, for the specified range.
+  ///
+  /// \param numWorkItems specifies the global work space of the kernel
+  /// \param depEvents is a vector of events that specifies the kernel
+  /// dependencies.
+  /// \param rest acts as-if: const KernelType &KernelFunc".
+  // TODO: Rest will represent reduction types once it is supported.
+  template <typename KernelName = detail::AutoName, typename... Rest>
+  event parallel_for(range<3> numWorkItems, const std::vector<event> &depEvents,
+                     Rest &&...rest) {
+    return parallelForImpl<KernelName>(numWorkItems, depEvents,
+                                       std::forward<Rest>(rest)...);
+  }
+
   /// Blocks the calling thread until all commands previously submitted to this
   /// queue have completed. Synchronous errors are reported through SYCL
   /// exceptions.
   void wait();
 
 private:
-  // Name of this function is defined by compiler. It generates call to this
-  // function in the host implementation of KernelFunc in submitSingleTask.
+  template <typename KernelName, int Dims, typename... Rest>
+  event parallelForImpl(range<Dims> numWorkItems,
+                        const std::vector<event> &depEvents, Rest &&...rest) {
+    if constexpr (sizeof...(Rest) != 1)
+      throw sycl::exception(errc::feature_not_supported,
+                            "Reductions are not supported.");
+    setKernelParameters(depEvents, numWorkItems);
+
+    using KernelType =
+        std::decay_t<detail::nth_type_t<sizeof...(Rest) - 1, Rest...>>;
+    using LambdaArgType = sycl::detail::lambda_arg_type<KernelType, item<Dims>>;
+    static_assert(
+        std::is_convertible_v<sycl::item<Dims>, LambdaArgType>,
+        "Kernel argument of a sycl::parallel_for with sycl::range "
+        "must be either sycl::item or be convertible from sycl::item");
+
+    using NameT =
+        typename detail::get_kernel_name_t<KernelName, KernelType>::name;
+    submitParallelFor<NameT, item<Dims>, KernelType>(rest...);
+    return getLastEvent();
+  }
+
+  /// Name of this function is defined by compiler. It generates call to this
+  /// function in the host implementation of KernelFunc in submitSingleTask or
+  /// submitParallelFor.
+  /// \param KernelName a name of the kernel being invoked.
+  /// \param args kernel arguments for kernel invocation.
+  // TODO: now `args` always represents  single argument - lambda capture.
   template <typename, typename... Args>
   void sycl_kernel_launch(const char *KernelName, Args &&...args) {
     static_assert((sizeof...(args) == 1) &&
@@ -221,6 +351,10 @@ class _LIBSYCL_EXPORT queue {
     submitKernelImpl(KernelName, TypelessArgs);
   }
 
+  /// The sycl_kernel_entry_point attribute facilitates the generation of an
+  /// offload kernel entry point function with parameters corresponding to the
+  /// (potentially decomposed) kernel arguments and a body that (potentially
+  /// reconstructs the arguments and) executes the kernel.
 #ifdef SYCL_LANGUAGE_VERSION
 #  define _LIBSYCL_ENTRY_POINT_ATTR__(KernelName)                              \
     [[clang::sycl_kernel_entry_point(KernelName)]]
@@ -228,18 +362,45 @@ class _LIBSYCL_EXPORT queue {
 #  define _LIBSYCL_ENTRY_POINT_ATTR__(KernelName)
 #endif // SYCL_LANGUAGE_VERSION
 
+  /// Specifies the parameters and body of the generated offload kernel entry
+  /// point for single_task invocations. On host compiler generates call to
+  /// sycl_kernel_launch instead of KernelFunc invocation.
   template <typename KernelName, typename KernelType>
   _LIBSYCL_ENTRY_POINT_ATTR__(KernelName)
   void submitSingleTask(const KernelType KernelFunc) {
     KernelFunc();
   }
 
-  event getLastEvent();
-  void submitKernelImpl(const char *KernelName,
-                        detail::ArgCollection &TypelessArgs);
+  /// Specifies the parameters and body of the generated offload kernel entry
+  /// point for parallel_for invocations. On host compiler generates call to
+  /// sycl_kernel_launch instead of KernelFunc invocation.
+  template <typename KernelName, typename ElementType, typename KernelType>
+  _LIBSYCL_ENTRY_POINT_ATTR__(KernelName)
+  void submitParallelFor(const KernelType KernelFunc) {
+#ifdef __SYCL_DEVICE_ONLY__
+    KernelFunc(detail::Builder::getElement(detail::declptr<ElementType>()));
+#endif
+    (void)KernelFunc;
+  }
+
+  /// Passes kernel parameters to runtime.
+  /// \param Events a collection of events representing dependencies of the
+  /// kernel to submit.
+  /// \param Range a unified view of range for kernel execution.
   void setKernelParameters(const std::vector<event> &Events,
                            const detail::UnifiedRangeView &Range = {});
 
+  /// Passes kernel arguments to runtime.
+  /// If all dependencies are met and kernel can be submitted to backend - it is
+  /// done in this call.
+  /// \param KernelName a name of the kernel being invoked.
+  /// \param TypelessArgs a unified arguments collection.
+  void submitKernelImpl(const char *KernelName,
+                        detail::ArgCollection &TypelessArgs);
+
+  /// \return an event representing last kernel invocation.
+  event getLastEvent();
+
   queue(const std::shared_ptr<detail::QueueImpl> &Impl) : impl(Impl) {}
   std::shared_ptr<detail::QueueImpl> impl;
 
diff --git a/libsycl/include/sycl/__spirv/spirv_vars.hpp b/libsycl/include/sycl/__spirv/spirv_vars.hpp
new file mode 100644
index 0000000000000..ec8c691b35e92
--- /dev/null
+++ b/libsycl/include/sycl/__spirv/spirv_vars.hpp
@@ -0,0 +1,75 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains SPIRV builtins needed for kernel invocations
+/// (parallel_for).
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBSYCL___SPIRV_SPIRV_VARS
+#define _LIBSYCL___SPIRV_SPIRV_VARS
+
+#ifdef __SYCL_DEVICE_ONLY__
+
+#  include <cstddef>
+#  include <cstdint>
+
+// SPIR-V built-in variables mapped to function call.
+#  define _LIBSYCL_SYCL_DEVICE_ATTR __attribute__((sycl_external))
+
+_LIBSYCL_SYCL_DEVICE_ATTR __attribute__((const)) size_t
+__spirv_BuiltInGlobalInvocationId(int);
+_LIBSYCL_SYCL_DEVICE_ATTR __attribute__((const)) size_t
+__spirv_BuiltInGlobalSize(int);
+_LIBSYCL_SYCL_DEVICE_ATTR __attribute__((const)) size_t
+__spirv_BuiltInGlobalOffset(int);
+
+namespace __spirv {
+
+// Helper function templates to initialize and get vector component from SPIR-V
+// built-in variables
+#  define __SPIRV_DEFINE_INIT_AND_GET_HELPERS(POSTFIX)                         \
+    template <int ID> size_t get##POSTFIX();                                   \
+    template <> size_t get##POSTFIX<0>() { return __spirv_##POSTFIX(0); }      \
+    template <> size_t get##POSTFIX<1>() { return __spirv_##POSTFIX(1); }      \
+    template <> size_t get##POSTFIX<2>() { return __spirv_##POSTFIX(2); }      \
+                                                                               \
+    template <int Dim, class DstT> struct InitSizesST##POSTFIX;                \
+                                                                               \
+    template <class DstT> struct InitSizesST##POSTFIX<1, DstT> {               \
+      static DstT initSize() { return {get##POSTFIX<0>()}; }                   \
+    };                                                                         \
+                                                                               \
+    template <class DstT> struct InitSizesST##POSTFIX<2, DstT> {               \
+      static DstT initSize() {                                                 \
+        return {get##POSTFIX<1>(), get##POSTFIX<0>()};                         \
+      }                                                                        \
+    };                                                                         \
+                                                                               \
+    template <class DstT> struct InitSizesST##POSTFIX<3, DstT> {               \
+      static DstT initSize() {                                                 \
+        return {get##POSTFIX<2>(), get##POSTFIX<1>(), get##POSTFIX<0>()};      \
+      }                                                                        \
+    };                                                                         \
+                                                                               \
+    template <int Dims, class DstT> DstT init##POSTFIX() {                     \
+      return InitSizesST##POSTFIX<Dims, DstT>::initSize();                     \
+    }
+
+__SPIRV_DEFINE_INIT_AND_GET_HELPERS(BuiltInGlobalSize);
+__SPIRV_DEFINE_INIT_AND_GET_HELPERS(BuiltInGlobalInvocationId)
+__SPIRV_DEFINE_INIT_AND_GET_HELPERS(BuiltInGlobalOffset)
+
+#  undef __SPIRV_DEFINE_INIT_AND_GET_HELPERS
+
+} // namespace __spirv
+
+#endif //__SYCL_DEVICE_ONLY__
+
+#endif // _LIBSYCL___SPIRV_SPIRV_VARS
diff --git a/libsycl/test/basic/queue_parallel_for_generic.cpp b/libsycl/test/basic/queue_parallel_for_generic.cpp
new file mode 100644
index 0000000000000..cac423b85f218
--- /dev/null
+++ b/libsycl/test/basic/queue_parallel_for_generic.cpp
@@ -0,0 +1,47 @@
+// REQUIRES: any-device
+// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: %t.out
+
+#include <sycl/sycl.hpp>
+
+#include <cassert>
+#include <iostream>
+#include <type_traits>
+
+int main() {
+  // TODO: uncomment property once it is implemented. now all sycl::queue
+  // objects are in-order due to liboffload limitation. Test is intended to
+  // check in-order execution.
+  sycl::queue q{/*sycl::property::queue::in_order()*/};
+  auto dev = q.get_device();
+  auto ctx = q.get_context();
+  constexpr int N = 8;
+
+  auto A = static_cast<int *>(sycl::malloc_shared(N * sizeof(int), dev, ctx));
+
+  for (int i = 0; i < N; i++) {
+    A[i] = 1;
+  }
+
+  q.parallel_for<class Bar>(N, [=](auto i) {
+    static_assert(std::is_same<decltype(i), sycl::item<1>>::value,
+                  "lambda arg type is unexpected");
+    A[i]++;
+  });
+
+  q.parallel_for<class Foo>({N}, [=](auto i) {
+    static_assert(std::is_same<decltype(i), sycl::item<1>>::value,
+                  "lambda arg type is unexpected");
+    A[i]++;
+  });
+
+  // TODO: add kernel with offset and kernel with nd_range once they
+  // are implemented.
+
+  q.wait();
+
+  for (int i = 0; i < N; i++) {
+    assert(A[i] == 3);
+  }
+  sycl::free(A, ctx);
+}
diff --git a/libsycl/test/basic/wrapped_usm_pointers.cpp b/libsycl/test/basic/wrapped_usm_pointers.cpp
new file mode 100644
index 0000000000000..16a86963cc976
--- /dev/null
+++ b/libsycl/test/basic/wrapped_usm_pointers.cpp
@@ -0,0 +1,111 @@
+// REQUIRES: any-device
+// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: %t.out
+
+#include <sycl/sycl.hpp>
+
+#include <iostream>
+
+struct Simple {
+  int *Data;
+  int Addition;
+};
+
+struct WrapperOfSimple {
+  int Addition;
+  Simple Obj;
+};
+
+struct NonTrivial {
+  int Addition;
+  int *Data;
+
+  NonTrivial(int *D, int A) : Data(D), Addition(A) {}
+};
+
+struct NonTrivialDerived : NonTrivial {
+  int AA = 0;
+  NonTrivialDerived(int *D, int A) : NonTrivial(D, A) {}
+};
+
+using namespace sycl;
+
+int main() {
+  constexpr int NumOfElements = 7;
+
+  queue Q;
+
+  NonTrivial NonTrivialObj(sycl::malloc_shared<int>(NumOfElements, Q), 38);
+  NonTrivialDerived NonTrivialDerivedObj(
+      sycl::malloc_shared<int>(NumOfElements, Q), 39);
+  Simple SimpleObj = {sycl::malloc_shared<int>(NumOfElements, Q), 42};
+  WrapperOfSimple WrapperOfSimpleObj = {
+      300, {sycl::malloc_shared<int>(NumOfElements, Q), 100500}};
+
+  // Test simple struct containing pointer.
+  Q.parallel_for(NumOfElements, [=](id<1> Idx) {
+    SimpleObj.Data[Idx] = Idx + SimpleObj.Addition;
+  });
+
+  // Test simple non-trivial struct containing pointer.
+  Q.parallel_for(NumOfElements, [=](id<1> Idx) {
+    NonTrivialObj.Data[Idx] = Idx + NonTrivialObj.Addition;
+  });
+
+  // Test simple non-trivial derived struct containing pointer.
+  Q.parallel_for(NumOfElements, [=](id<1> Idx) {
+    NonTrivialDerivedObj.Data[Idx] = Idx + NonTrivialDerivedObj.Addition;
+  });
+
+  // Test nested struct containing pointer.
+  Q.parallel_for(NumOfElements, [=](id<1> Idx) {
+    WrapperOfSimpleObj.Obj.Data[Idx] = Idx + WrapperOfSimpleObj.Obj.Addition;
+  });
+
+  // Test array of structs containing pointers.
+  Simple SimpleArr[NumOfElements];
+  for (int i = 0; i < NumOfElements; ++i) {
+    SimpleArr[i].Data = sycl::malloc_shared<int>(NumOfElements, Q);
+    SimpleArr[i].Addition = 38 + i;
+  }
+
+  Q.parallel_for(range<2>(NumOfElements, NumOfElements), [=](item<2> Idx) {
+    SimpleArr[Idx.get_id(0)].Data[Idx.get_id(1)] =
+        Idx.get_id(1) + SimpleArr[Idx.get_id(0)].Addition;
+  });
+
+  Q.wait();
+
+  auto Checker = [](auto Obj) {
+    for (int i = 0; i < NumOfElements; ++i) {
+      if (Obj.Data[i] != (i + Obj.Addition)) {
+        std::cout << "line: " << __LINE__ << " result[" << i << "] is "
+                  << Obj.Data[i] << " expected " << i + Obj.Addition
+                  << std::endl;
+        return true; // true if fail
+      }
+    }
+
+    return false;
+  };
+
+  bool Fail = false;
+  Fail = Checker(SimpleObj);
+  Fail = Checker(NonTrivialObj);
+  Fail = Checker(NonTrivialDerivedObj);
+  Fail = Checker(WrapperOfSimpleObj.Obj);
+
+  for (int i = 0; i < NumOfElements; ++i)
+    Fail = Checker(SimpleArr[i]);
+
+  // Free allocated memory.
+  sycl::free(NonTrivialObj.Data, Q);
+  sycl::free(NonTrivialDerivedObj.Data, Q);
+  sycl::free(SimpleObj.Data, Q);
+  sycl::free(WrapperOfSimpleObj.Obj.Data, Q);
+
+  for (int i = 0; i < NumOfElements; ++i)
+    sycl::free(SimpleArr[i].Data, Q);
+
+  return Fail;
+}

>From a5b6a4c40d367b6f80d7ec43915d94a0707aa911 Mon Sep 17 00:00:00 2001
From: "Tikhomirova, Kseniya" <kseniya.tikhomirova at intel.com>
Date: Fri, 27 Mar 2026 10:37:44 -0700
Subject: [PATCH 3/7] removed invalid comment

Signed-off-by: Tikhomirova, Kseniya <kseniya.tikhomirova at intel.com>
---
 libsycl/include/sycl/__impl/detail/kernel_arg_helpers.hpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/libsycl/include/sycl/__impl/detail/kernel_arg_helpers.hpp b/libsycl/include/sycl/__impl/detail/kernel_arg_helpers.hpp
index d4a0ea9f63ff2..a7478e1300e21 100644
--- a/libsycl/include/sycl/__impl/detail/kernel_arg_helpers.hpp
+++ b/libsycl/include/sycl/__impl/detail/kernel_arg_helpers.hpp
@@ -5,8 +5,6 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// to add
-//===----------------------------------------------------------------------===//
 
 #ifndef _LIBSYCL___IMPL_DETAIL_KERNEL_ARG_HELPERS
 #define _LIBSYCL___IMPL_DETAIL_KERNEL_ARG_HELPERS

>From 60af5a987839fac6b250e7519a74fe33ef8ff3b4 Mon Sep 17 00:00:00 2001
From: "Tikhomirova, Kseniya" <kseniya.tikhomirova at intel.com>
Date: Tue, 21 Apr 2026 04:40:29 -0700
Subject: [PATCH 4/7] fix merge errors

Signed-off-by: Tikhomirova, Kseniya <kseniya.tikhomirova at intel.com>
---
 .../sycl/__impl/detail/arg_wrapper.hpp        | 135 ------------------
 libsycl/include/sycl/__impl/queue.hpp         |  43 ------
 2 files changed, 178 deletions(-)
 delete mode 100644 libsycl/include/sycl/__impl/detail/arg_wrapper.hpp

diff --git a/libsycl/include/sycl/__impl/detail/arg_wrapper.hpp b/libsycl/include/sycl/__impl/detail/arg_wrapper.hpp
deleted file mode 100644
index 96f60a3121787..0000000000000
--- a/libsycl/include/sycl/__impl/detail/arg_wrapper.hpp
+++ /dev/null
@@ -1,135 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file contains helper functions used to wrap kernel arguments to
-/// typeless collection.
-///
-//===----------------------------------------------------------------------===//
-
-#ifndef _LIBSYCL___IMPL_DETAIL_ARG_WRAPPER_HPP
-#define _LIBSYCL___IMPL_DETAIL_ARG_WRAPPER_HPP
-
-#include <sycl/__impl/detail/config.hpp>
-#include <sycl/__impl/exception.hpp>
-
-#include <cassert>
-#include <memory>
-
-_LIBSYCL_BEGIN_NAMESPACE_SYCL
-
-namespace detail {
-
-/// Base class is needed for unification, we pass arguments through ABI
-/// boundary.
-class ArgWrapperBase {
-public:
-  ArgWrapperBase(const ArgWrapperBase &) = delete;
-  ArgWrapperBase &operator=(const ArgWrapperBase &) = delete;
-  virtual ~ArgWrapperBase() = default;
-
-  virtual void deepCopy() = 0;
-  virtual size_t getSize() const = 0;
-  virtual const void *getPtr() const = 0;
-
-protected:
-  ArgWrapperBase() = default;
-};
-
-/// Helps to manage arguments in a typeless way.
-template <typename Type> class ArgWrapper : public ArgWrapperBase {
-public:
-  ArgWrapper(Type &Arg) { Ptr = &Arg; }
-  ArgWrapper(const ArgWrapper &) = delete;
-  ArgWrapper &operator=(const ArgWrapper &) = delete;
-
-  /// \return size of argument in bytes.
-  size_t getSize() const override { return sizeof(Type); }
-
-  /// Returns raw pointer to the corresponding argument.
-  /// No copy is done by this method. It works with pointer to the memory whose
-  /// existence must be guaranteed by class user or with copy that must be
-  /// explicitly requested by class user via deepCopy method.
-  /// \return pointer to the argument.
-  const void *getPtr() const override {
-    assert((!DeepCopy || (DeepCopy.get()) == Ptr) &&
-           "Incorrect state of copied argument");
-    return Ptr;
-  }
-
-  /// Copies agrument to RT owned storage.
-  void deepCopy() override {
-    if (DeepCopy)
-      return;
-
-    DeepCopy.reset(new Type(*Ptr));
-    Ptr = DeepCopy.get();
-  }
-
-private:
-  Type *Ptr;
-  std::unique_ptr<Type> DeepCopy;
-};
-
-/// Collection of arguments. Provides functionality to accumulate all arguments
-/// data to pass through ABI boundary.
-class ArgCollection {
-public:
-  /// Adds argument to the collection. Don't own the memory. Argument lifetime
-  /// must be guaranteed by class user. If extended lifetime is needed (copy),
-  /// deepCopy must be called.
-  template <typename Type> void addArg(Type &Arg) {
-    MArgs.emplace_back(new ArgWrapper(Arg));
-  }
-
-  /// \return array of argument pointers.
-  const void **getArgPtrArray() {
-    if (MPtrs.size() != MArgs.size()) {
-      MPtrs.clear();
-      MPtrs.reserve(MArgs.size());
-      auto it = MArgs.cbegin();
-      while (it != MArgs.cend()) {
-        MPtrs.push_back((*it++)->getPtr());
-      }
-    }
-    return MPtrs.data();
-  }
-
-  /// \return array of argument sizes.
-  int64_t *getSizesArray() {
-    if (MSizes.size() != MArgs.size()) {
-      MSizes.clear();
-      MSizes.reserve(MArgs.size());
-      auto it = MArgs.cbegin();
-      while (it != MArgs.cend()) {
-        MSizes.push_back(static_cast<int64_t>((*it++)->getSize()));
-      }
-    }
-    return MSizes.data();
-  }
-
-  /// \return count of arguments in collection.
-  size_t getArgCount() { return MArgs.size(); }
-
-  /// Extends arguments lifetime by doing copy of all arguments.
-  void deepCopy() {
-    for (auto &Arg : MArgs)
-      Arg->deepCopy();
-  }
-
-private:
-  std::vector<std::unique_ptr<ArgWrapperBase>> MArgs;
-  std::vector<int64_t> MSizes;
-  std::vector<const void *> MPtrs;
-};
-
-} // namespace detail
-
-_LIBSYCL_END_NAMESPACE_SYCL
-
-#endif // _LIBSYCL___IMPL_DETAIL_ARG_WRAPPER_HPP
diff --git a/libsycl/include/sycl/__impl/queue.hpp b/libsycl/include/sycl/__impl/queue.hpp
index 5f31777c09cf7..e3856d2f5b4b6 100644
--- a/libsycl/include/sycl/__impl/queue.hpp
+++ b/libsycl/include/sycl/__impl/queue.hpp
@@ -20,7 +20,6 @@
 #include <sycl/__impl/event.hpp>
 #include <sycl/__impl/property_list.hpp>
 
-#include <sycl/__impl/detail/arg_wrapper.hpp>
 #include <sycl/__impl/detail/config.hpp>
 #include <sycl/__impl/detail/default_async_handler.hpp>
 #include <sycl/__impl/detail/get_device_kernel_info.hpp>
@@ -311,48 +310,6 @@ class _LIBSYCL_EXPORT queue {
   /// exceptions.
   void wait();
 
-  /// Defines and invokes a SYCL kernel function as a lambda expression or a
-  /// named function object type.
-  ///
-  /// \param kernelFunc is the kernel functor or lambda.
-  /// \return an event that represents the status of the submitted kernel.
-  template <typename KernelName, typename KernelType>
-  event single_task(const KernelType &kernelFunc) {
-    return single_task<KernelName, KernelType>({}, kernelFunc);
-  }
-
-  /// Defines and invokes a SYCL kernel function as a lambda expression or a
-  /// named function object type.
-  ///
-  /// \param depEvent is an event that specifies the kernel dependency.
-  /// \param kernelFunc is the kernel functor or lambda.
-  /// \return an event that represents the status of the submitted kernel.
-  template <typename KernelName, typename KernelType>
-  event single_task(event depEvent, const KernelType &kernelFunc) {
-    return single_task<KernelName, KernelType>({depEvent}, kernelFunc);
-  }
-
-  /// Defines and invokes a SYCL kernel function as a lambda expression or a
-  /// named function object type.
-  ///
-  /// \param depEvents is a collection of events that specify the kernel
-  /// dependencies.
-  /// \param kernelFunc is the kernel functor or lambda.
-  /// \return an event that represents the status of the submitted kernel.
-  template <typename KernelName, typename KernelType>
-  event single_task(const std::vector<event> &depEvents,
-                    const KernelType &kernelFunc) {
-    static_assert(
-        detail::CheckFunctionSignature<std::remove_reference_t<KernelType>,
-                                       void()>::value,
-        "sycl::queue::single_task() requires a kernel instead of a command "
-        "group");
-
-    setKernelParameters(depEvents);
-    submitSingleTask<KernelName, KernelType>(kernelFunc);
-    return getLastEvent();
-  }
-
 private:
   template <typename KernelName, int Dims, typename... Rest>
   event parallelForImpl(range<Dims> numWorkItems,

>From 2861b1031d9c712c8c43c360b1fc88ab3ba388ec Mon Sep 17 00:00:00 2001
From: "Tikhomirova, Kseniya" <kseniya.tikhomirova at intel.com>
Date: Tue, 21 Apr 2026 08:46:55 -0700
Subject: [PATCH 5/7] fix more comments and revert some merge issues

Signed-off-by: Tikhomirova, Kseniya <kseniya.tikhomirova at intel.com>
---
 .../sycl/__impl/detail/kernel_arg_helpers.hpp | 11 ++++++++--
 .../sycl/__impl/index_space_classes.hpp       | 20 +++++++++----------
 libsycl/include/sycl/__impl/queue.hpp         | 20 +++++++++----------
 libsycl/include/sycl/__spirv/spirv_vars.hpp   | 10 +++-------
 libsycl/include/sycl/sycl.hpp                 |  1 +
 libsycl/src/detail/queue_impl.cpp             |  1 +
 libsycl/src/detail/queue_impl.hpp             |  1 -
 libsycl/test/basic/wrapped_usm_pointers.cpp   | 10 +++++-----
 8 files changed, 38 insertions(+), 36 deletions(-)

diff --git a/libsycl/include/sycl/__impl/detail/kernel_arg_helpers.hpp b/libsycl/include/sycl/__impl/detail/kernel_arg_helpers.hpp
index a7478e1300e21..f3d733981922a 100644
--- a/libsycl/include/sycl/__impl/detail/kernel_arg_helpers.hpp
+++ b/libsycl/include/sycl/__impl/detail/kernel_arg_helpers.hpp
@@ -5,6 +5,11 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains helpers for kernel invocation.
+///
+//===----------------------------------------------------------------------===//
 
 #ifndef _LIBSYCL___IMPL_DETAIL_KERNEL_ARG_HELPERS
 #define _LIBSYCL___IMPL_DETAIL_KERNEL_ARG_HELPERS
@@ -17,11 +22,13 @@
 #  include <sycl/__spirv/spirv_vars.hpp>
 #endif
 
+#include <type_traits>
+
 _LIBSYCL_BEGIN_NAMESPACE_SYCL
 
 namespace detail {
 
-/// \name  Helpers for the unnamed lambda extension.
+/// \name  Helpers for the unnamed lambda.
 /// @{
 /// This class is the default kernel name template parameter type for kernel
 /// invocation APIs such as single_task.
@@ -35,7 +42,7 @@ template <typename Name, typename Type> struct get_kernel_name_t {
 };
 
 /// Specialization for the case when Name is undefined.
-/// This is only legal with our compiler with the unnamed lambda extension or if
+/// This is only legal with our compiler with the unnamed lambda support or if
 /// the kernel is a functor object.
 template <typename Type> struct get_kernel_name_t<detail::AutoName, Type> {
   using name = Type;
diff --git a/libsycl/include/sycl/__impl/index_space_classes.hpp b/libsycl/include/sycl/__impl/index_space_classes.hpp
index ef2897cee5307..0dc8e90decc3d 100644
--- a/libsycl/include/sycl/__impl/index_space_classes.hpp
+++ b/libsycl/include/sycl/__impl/index_space_classes.hpp
@@ -17,6 +17,9 @@
 
 #include <sycl/__impl/detail/config.hpp>
 
+#include <cstddef>
+#include <type_traits>
+
 _LIBSYCL_BEGIN_NAMESPACE_SYCL
 
 namespace detail {
@@ -95,12 +98,7 @@ template <int Dimensions = 1> class RawArray {
 
   friend bool operator!=(const RawArray<Dimensions> &lhs,
                          const RawArray<Dimensions> &rhs) {
-    for (int i = 0; i < Dimensions; ++i) {
-      if (lhs.MArray[i] != rhs.MArray[i]) {
-        return true;
-      }
-    }
-    return false;
+    return !(lhs == rhs);
   }
 
 protected:
@@ -370,13 +368,13 @@ template <int Dimensions /* = 1*/, bool WithOffset /* = true*/> class item {
   std::size_t get_linear_id() const noexcept {
     if constexpr (WithOffset) {
       if constexpr (1 == Dimensions) {
-        return MId;
+        return MId[0] - MOffset[0];
       }
       if constexpr (2 == Dimensions) {
-        return (MId[0] - MOffset[0]) * MRange[1] + (MId[1] - MOffset[1]);
+        return (MId[0] - MOffset[0]) * MRange[1] + MId[1] - MOffset[1];
       }
-      return ((MId[0] - MOffset[0]) * MRange[1] * MRange[2]) +
-             ((MId[1] - MOffset[1]) * MRange[2]) + (MId[2] - MOffset[2]);
+      return (MId[0] - MOffset[0]) * MRange[1] * MRange[2] +
+             (MId[1] - MOffset[1]) * MRange[2] + MId[2] - MOffset[2];
     } else {
       if constexpr (1 == Dimensions) {
         return MId[0];
@@ -384,7 +382,7 @@ template <int Dimensions /* = 1*/, bool WithOffset /* = true*/> class item {
       if constexpr (2 == Dimensions) {
         return MId[0] * MRange[1] + MId[1];
       }
-      return (MId[0] * MRange[1] * MRange[2]) + (MId[1] * MRange[2]) + MId[2];
+      return MId[0] * MRange[1] * MRange[2] + MId[1] * MRange[2] + MId[2];
     }
   }
 
diff --git a/libsycl/include/sycl/__impl/queue.hpp b/libsycl/include/sycl/__impl/queue.hpp
index e3856d2f5b4b6..ea96ee03da5ee 100644
--- a/libsycl/include/sycl/__impl/queue.hpp
+++ b/libsycl/include/sycl/__impl/queue.hpp
@@ -138,6 +138,11 @@ class _LIBSYCL_EXPORT queue {
   template <typename Param>
   typename Param::return_type get_backend_info() const;
 
+  /// Blocks the calling thread until all commands previously submitted to this
+  /// queue have completed. Synchronous errors are reported through SYCL
+  /// exceptions.
+  void wait();
+
   /// Defines and invokes a SYCL kernel function as a lambda expression or a
   /// named function object type.
   ///
@@ -172,8 +177,8 @@ class _LIBSYCL_EXPORT queue {
     static_assert(
         (detail::CheckFunctionSignature<std::remove_reference_t<KernelType>,
                                         void()>::value),
-        "sycl::queue::single_task() requires a kernel instead of command "
-        "group. ");
+        "sycl::queue::single_task() requires a kernel instead of a command "
+        "group");
 
     setKernelParameters(depEvents);
     using NameT =
@@ -305,18 +310,13 @@ class _LIBSYCL_EXPORT queue {
                                        std::forward<Rest>(rest)...);
   }
 
-  /// Blocks the calling thread until all commands previously submitted to this
-  /// queue have completed. Synchronous errors are reported through SYCL
-  /// exceptions.
-  void wait();
-
 private:
   template <typename KernelName, int Dims, typename... Rest>
   event parallelForImpl(range<Dims> numWorkItems,
                         const std::vector<event> &depEvents, Rest &&...rest) {
     if constexpr (sizeof...(Rest) != 1)
       throw sycl::exception(errc::feature_not_supported,
-                            "Reductions are not supported.");
+                            "Reductions are not supported");
     setKernelParameters(depEvents, numWorkItems);
 
     using KernelType =
@@ -366,7 +366,7 @@ class _LIBSYCL_EXPORT queue {
   /// sycl_kernel_launch instead of KernelFunc invocation.
   template <typename KernelName, typename KernelType>
   _LIBSYCL_ENTRY_POINT_ATTR__(KernelName)
-  void submitSingleTask(const KernelType KernelFunc) {
+  void submitSingleTask(const KernelType &KernelFunc) {
     KernelFunc();
   }
 
@@ -375,7 +375,7 @@ class _LIBSYCL_EXPORT queue {
   /// sycl_kernel_launch instead of KernelFunc invocation.
   template <typename KernelName, typename ElementType, typename KernelType>
   _LIBSYCL_ENTRY_POINT_ATTR__(KernelName)
-  void submitParallelFor(const KernelType KernelFunc) {
+  void submitParallelFor(const KernelType &KernelFunc) {
 #ifdef __SYCL_DEVICE_ONLY__
     KernelFunc(detail::Builder::getElement(detail::declptr<ElementType>()));
 #endif
diff --git a/libsycl/include/sycl/__spirv/spirv_vars.hpp b/libsycl/include/sycl/__spirv/spirv_vars.hpp
index ec8c691b35e92..2c93e510565b3 100644
--- a/libsycl/include/sycl/__spirv/spirv_vars.hpp
+++ b/libsycl/include/sycl/__spirv/spirv_vars.hpp
@@ -21,14 +21,10 @@
 #  include <cstdint>
 
 // SPIR-V built-in variables mapped to function call.
-#  define _LIBSYCL_SYCL_DEVICE_ATTR __attribute__((sycl_external))
 
-_LIBSYCL_SYCL_DEVICE_ATTR __attribute__((const)) size_t
-__spirv_BuiltInGlobalInvocationId(int);
-_LIBSYCL_SYCL_DEVICE_ATTR __attribute__((const)) size_t
-__spirv_BuiltInGlobalSize(int);
-_LIBSYCL_SYCL_DEVICE_ATTR __attribute__((const)) size_t
-__spirv_BuiltInGlobalOffset(int);
+__attribute__((const)) size_t __spirv_BuiltInGlobalInvocationId(int);
+__attribute__((const)) size_t __spirv_BuiltInGlobalSize(int);
+__attribute__((const)) size_t __spirv_BuiltInGlobalOffset(int);
 
 namespace __spirv {
 
diff --git a/libsycl/include/sycl/sycl.hpp b/libsycl/include/sycl/sycl.hpp
index ce9fc8defd90b..7e81d952bd41c 100644
--- a/libsycl/include/sycl/sycl.hpp
+++ b/libsycl/include/sycl/sycl.hpp
@@ -19,6 +19,7 @@
 #include <sycl/__impl/device_selector.hpp>
 #include <sycl/__impl/event.hpp>
 #include <sycl/__impl/exception.hpp>
+#include <sycl/__impl/index_space_classes.hpp>
 #include <sycl/__impl/platform.hpp>
 #include <sycl/__impl/queue.hpp>
 #include <sycl/__impl/usm_functions.hpp>
diff --git a/libsycl/src/detail/queue_impl.cpp b/libsycl/src/detail/queue_impl.cpp
index 93a1f43d25bf6..623b326637932 100644
--- a/libsycl/src/detail/queue_impl.cpp
+++ b/libsycl/src/detail/queue_impl.cpp
@@ -20,6 +20,7 @@ namespace detail {
 
 static void setKernelLaunchArgs(const detail::UnifiedRangeView &Range,
                                 ol_kernel_launch_size_args_t &ArgsToSet) {
+  assert(Range.MDims < 4 && "Invalid dimensions.");
   uint32_t GlobalSize[3] = {1, 1, 1};
   if (Range.MGlobalSize) {
     for (size_t I = 0; I < Range.MDims; ++I) {
diff --git a/libsycl/src/detail/queue_impl.hpp b/libsycl/src/detail/queue_impl.hpp
index a504c467e3927..8800464e96612 100644
--- a/libsycl/src/detail/queue_impl.hpp
+++ b/libsycl/src/detail/queue_impl.hpp
@@ -21,7 +21,6 @@
 #include <OffloadAPI.h>
 
 #include <memory>
-#include <mutex>
 
 _LIBSYCL_BEGIN_NAMESPACE_SYCL
 namespace detail {
diff --git a/libsycl/test/basic/wrapped_usm_pointers.cpp b/libsycl/test/basic/wrapped_usm_pointers.cpp
index 16a86963cc976..c936dcada4a6b 100644
--- a/libsycl/test/basic/wrapped_usm_pointers.cpp
+++ b/libsycl/test/basic/wrapped_usm_pointers.cpp
@@ -90,13 +90,13 @@ int main() {
   };
 
   bool Fail = false;
-  Fail = Checker(SimpleObj);
-  Fail = Checker(NonTrivialObj);
-  Fail = Checker(NonTrivialDerivedObj);
-  Fail = Checker(WrapperOfSimpleObj.Obj);
+  Fail |= Checker(SimpleObj);
+  Fail |= Checker(NonTrivialObj);
+  Fail |= Checker(NonTrivialDerivedObj);
+  Fail |= Checker(WrapperOfSimpleObj.Obj);
 
   for (int i = 0; i < NumOfElements; ++i)
-    Fail = Checker(SimpleArr[i]);
+    Fail |= Checker(SimpleArr[i]);
 
   // Free allocated memory.
   sycl::free(NonTrivialObj.Data, Q);

>From b05c6d1017482699ef865187d0c679a870777676 Mon Sep 17 00:00:00 2001
From: "Tikhomirova, Kseniya" <kseniya.tikhomirova at intel.com>
Date: Wed, 22 Apr 2026 09:12:44 -0700
Subject: [PATCH 6/7] fix comments

Signed-off-by: Tikhomirova, Kseniya <kseniya.tikhomirova at intel.com>
---
 .../sycl/__impl/detail/kernel_arg_helpers.hpp |  7 +-
 libsycl/include/sycl/__impl/queue.hpp         | 13 ++-
 libsycl/include/sycl/__spirv/spirv_vars.hpp   | 46 ++++-----
 libsycl/test/basic/parallel_for_indexers.cpp  | 98 +++++++++++++++++++
 4 files changed, 127 insertions(+), 37 deletions(-)
 create mode 100644 libsycl/test/basic/parallel_for_indexers.cpp

diff --git a/libsycl/include/sycl/__impl/detail/kernel_arg_helpers.hpp b/libsycl/include/sycl/__impl/detail/kernel_arg_helpers.hpp
index f3d733981922a..d58df91f19465 100644
--- a/libsycl/include/sycl/__impl/detail/kernel_arg_helpers.hpp
+++ b/libsycl/include/sycl/__impl/detail/kernel_arg_helpers.hpp
@@ -18,9 +18,7 @@
 
 #include <sycl/__impl/detail/config.hpp>
 
-#ifdef __SYCL_DEVICE_ONLY__
-#  include <sycl/__spirv/spirv_vars.hpp>
-#endif
+#include <sycl/__spirv/spirv_vars.hpp>
 
 #include <type_traits>
 
@@ -121,7 +119,6 @@ class Builder {
 public:
   Builder() = delete;
 
-#ifdef __SYCL_DEVICE_ONLY__
   /// \return a global index of work item currently being operated on by device.
   template <int Dims> static const id<Dims> getElement(id<Dims> *) {
     static_assert(isValidDimensions<Dims>, "invalid dimensions");
@@ -181,8 +178,6 @@ class Builder {
       -> decltype(getItem<Dims, WithOffset>()) {
     return getItem<Dims, WithOffset>();
   }
-
-#endif // __SYCL_DEVICE_ONLY__
 };
 
 } // namespace detail
diff --git a/libsycl/include/sycl/__impl/queue.hpp b/libsycl/include/sycl/__impl/queue.hpp
index ea96ee03da5ee..776cbabfe9c8e 100644
--- a/libsycl/include/sycl/__impl/queue.hpp
+++ b/libsycl/include/sycl/__impl/queue.hpp
@@ -323,13 +323,19 @@ class _LIBSYCL_EXPORT queue {
         std::decay_t<detail::nth_type_t<sizeof...(Rest) - 1, Rest...>>;
     using LambdaArgType = sycl::detail::lambda_arg_type<KernelType, item<Dims>>;
     static_assert(
-        std::is_convertible_v<sycl::item<Dims>, LambdaArgType>,
+        std::is_convertible_v<sycl::item<Dims>, LambdaArgType> ||
+            std::is_convertible_v<sycl::item<Dims, false>, LambdaArgType>,
         "Kernel argument of a sycl::parallel_for with sycl::range "
         "must be either sycl::item or be convertible from sycl::item");
+    using TranformedLambdaArgType = std::conditional_t<
+        std::is_convertible_v<item<Dims>, LambdaArgType>, item<Dims>,
+        std::conditional_t<
+            std::is_convertible_v<item<Dims, false>, LambdaArgType>,
+            item<Dims, false>, LambdaArgType>>;
 
     using NameT =
         typename detail::get_kernel_name_t<KernelName, KernelType>::name;
-    submitParallelFor<NameT, item<Dims>, KernelType>(rest...);
+    submitParallelFor<NameT, TranformedLambdaArgType, KernelType>(rest...);
     return getLastEvent();
   }
 
@@ -376,10 +382,7 @@ class _LIBSYCL_EXPORT queue {
   template <typename KernelName, typename ElementType, typename KernelType>
   _LIBSYCL_ENTRY_POINT_ATTR__(KernelName)
   void submitParallelFor(const KernelType &KernelFunc) {
-#ifdef __SYCL_DEVICE_ONLY__
     KernelFunc(detail::Builder::getElement(detail::declptr<ElementType>()));
-#endif
-    (void)KernelFunc;
   }
 #undef _LIBSYCL_ENTRY_POINT_ATTR__
 
diff --git a/libsycl/include/sycl/__spirv/spirv_vars.hpp b/libsycl/include/sycl/__spirv/spirv_vars.hpp
index 2c93e510565b3..c8d2c990d76c3 100644
--- a/libsycl/include/sycl/__spirv/spirv_vars.hpp
+++ b/libsycl/include/sycl/__spirv/spirv_vars.hpp
@@ -15,8 +15,6 @@
 #ifndef _LIBSYCL___SPIRV_SPIRV_VARS
 #define _LIBSYCL___SPIRV_SPIRV_VARS
 
-#ifdef __SYCL_DEVICE_ONLY__
-
 #  include <cstddef>
 #  include <cstdint>
 
@@ -30,33 +28,31 @@ namespace __spirv {
 
 // Helper function templates to initialize and get vector component from SPIR-V
 // built-in variables
-#  define __SPIRV_DEFINE_INIT_AND_GET_HELPERS(POSTFIX)                         \
-    template <int ID> size_t get##POSTFIX();                                   \
-    template <> size_t get##POSTFIX<0>() { return __spirv_##POSTFIX(0); }      \
-    template <> size_t get##POSTFIX<1>() { return __spirv_##POSTFIX(1); }      \
-    template <> size_t get##POSTFIX<2>() { return __spirv_##POSTFIX(2); }      \
+#define __SPIRV_DEFINE_INIT_AND_GET_HELPERS(POSTFIX)                           \
+  template <int ID> size_t get##POSTFIX();                                     \
+  template <> inline size_t get##POSTFIX<0>() { return __spirv_##POSTFIX(0); } \
+  template <> inline size_t get##POSTFIX<1>() { return __spirv_##POSTFIX(1); } \
+  template <> inline size_t get##POSTFIX<2>() { return __spirv_##POSTFIX(2); } \
                                                                                \
-    template <int Dim, class DstT> struct InitSizesST##POSTFIX;                \
+  template <int Dim, class DstT> struct InitSizesST##POSTFIX;                  \
                                                                                \
-    template <class DstT> struct InitSizesST##POSTFIX<1, DstT> {               \
-      static DstT initSize() { return {get##POSTFIX<0>()}; }                   \
-    };                                                                         \
+  template <class DstT> struct InitSizesST##POSTFIX<1, DstT> {                 \
+    static DstT initSize() { return {get##POSTFIX<0>()}; }                     \
+  };                                                                           \
                                                                                \
-    template <class DstT> struct InitSizesST##POSTFIX<2, DstT> {               \
-      static DstT initSize() {                                                 \
-        return {get##POSTFIX<1>(), get##POSTFIX<0>()};                         \
-      }                                                                        \
-    };                                                                         \
+  template <class DstT> struct InitSizesST##POSTFIX<2, DstT> {                 \
+    static DstT initSize() { return {get##POSTFIX<1>(), get##POSTFIX<0>()}; }  \
+  };                                                                           \
                                                                                \
-    template <class DstT> struct InitSizesST##POSTFIX<3, DstT> {               \
-      static DstT initSize() {                                                 \
-        return {get##POSTFIX<2>(), get##POSTFIX<1>(), get##POSTFIX<0>()};      \
-      }                                                                        \
-    };                                                                         \
+  template <class DstT> struct InitSizesST##POSTFIX<3, DstT> {                 \
+    static DstT initSize() {                                                   \
+      return {get##POSTFIX<2>(), get##POSTFIX<1>(), get##POSTFIX<0>()};        \
+    }                                                                          \
+  };                                                                           \
                                                                                \
-    template <int Dims, class DstT> DstT init##POSTFIX() {                     \
-      return InitSizesST##POSTFIX<Dims, DstT>::initSize();                     \
-    }
+  template <int Dims, class DstT> DstT init##POSTFIX() {                       \
+    return InitSizesST##POSTFIX<Dims, DstT>::initSize();                       \
+  }
 
 __SPIRV_DEFINE_INIT_AND_GET_HELPERS(BuiltInGlobalSize);
 __SPIRV_DEFINE_INIT_AND_GET_HELPERS(BuiltInGlobalInvocationId)
@@ -66,6 +62,4 @@ __SPIRV_DEFINE_INIT_AND_GET_HELPERS(BuiltInGlobalOffset)
 
 } // namespace __spirv
 
-#endif //__SYCL_DEVICE_ONLY__
-
 #endif // _LIBSYCL___SPIRV_SPIRV_VARS
diff --git a/libsycl/test/basic/parallel_for_indexers.cpp b/libsycl/test/basic/parallel_for_indexers.cpp
new file mode 100644
index 0000000000000..e9cef87e7472b
--- /dev/null
+++ b/libsycl/test/basic/parallel_for_indexers.cpp
@@ -0,0 +1,98 @@
+// REQUIRES: any-device
+// RUN: %clangxx -fsycl -Wno-error=deprecated-declarations %s -o %t.out
+// RUN: %t.out
+
+#include <sycl/sycl.hpp>
+
+#include <cassert>
+#include <memory>
+
+using namespace sycl;
+
+// TODO: original test works with buffers, revert changes to USM once they are
+// implemented.
+// TODO add cases with dimensions more than 1
+int main() {
+  bool Fail{};
+
+  constexpr size_t DataSize = 10;
+  const range<1> globalRange(6);
+  // Id indexer
+  {
+    queue Q;
+    int *Data = sycl::malloc_shared<int>(DataSize, Q);
+    for (size_t i = 0; i < DataSize; ++i)
+      Data[i] = -1;
+
+    Q.parallel_for<class id1>(globalRange,
+                              [=](id<1> index) { Data[index] = index[0]; });
+    Q.wait();
+
+    for (size_t i = 0; i < DataSize; i++) {
+      const int id = Data[i];
+      if (i < globalRange[0]) {
+        Fail |= !(id == i);
+      } else {
+        Fail |= !(id == -1);
+      }
+    }
+
+    free(Data, Q);
+  }
+  // print and return;
+
+  // Item indexer without offset
+  {
+    // TODO: replace strcut with sycl::int2 once implemented.
+    struct DoubleInt {
+      int First;
+      int Second;
+    };
+    queue Q;
+    DoubleInt *Data = sycl::malloc_shared<DoubleInt>(DataSize, Q);
+    for (size_t i = 0; i < DataSize; ++i)
+      Data[i] = {-1, -1};
+
+    Q.parallel_for<class item1_nooffset>(
+        globalRange, [=](item<1, false> index) {
+          Data[index.get_id()] = {int(index.get_id()[0]),
+                                  int(index.get_range()[0])};
+        });
+    Q.wait();
+    for (size_t i = 0; i < DataSize; ++i) {
+      const int id = Data[i].First;
+      const int range = Data[i].Second;
+      if (i < globalRange[0]) {
+        Fail |= !(id == i);
+        Fail |= !(range == globalRange[0]);
+      } else {
+        Fail |= !(id == -1);
+        Fail |= !(range == -1);
+      }
+    }
+    free(Data, Q);
+  }
+
+  // get_linear_id()
+  {
+    queue Q;
+    size_t DataSize3D = DataSize * DataSize * DataSize;
+    int *Data = sycl::malloc_shared<int>(DataSize3D, Q);
+    Q.parallel_for(range<3>(DataSize, DataSize, DataSize), [=](item<3> Idx) {
+      auto Id = Idx.get_linear_id();
+      Data[Id] = Id;
+    });
+    Q.wait();
+    for (size_t i = 0; i < DataSize3D; ++i) {
+      Fail |= !(Data[i] == i);
+    }
+    free(Data, Q);
+  }
+
+  // TODO:  Item indexer with offset
+  // blocked by liboffload support
+  // blocked by absence of sycl::handler implementation
+
+  // TODO: add nd_item check
+  return Fail;
+}

>From 51cd8902bba241421f4ee188e3f944c7f5c0aa94 Mon Sep 17 00:00:00 2001
From: "Tikhomirova, Kseniya" <kseniya.tikhomirova at intel.com>
Date: Thu, 23 Apr 2026 04:05:03 -0700
Subject: [PATCH 7/7] fix tests

Signed-off-by: Tikhomirova, Kseniya <kseniya.tikhomirova at intel.com>
---
 .../sycl/__impl/index_space_classes.hpp       |  3 +-
 libsycl/include/sycl/__spirv/spirv_vars.hpp   |  6 +-
 libsycl/test/basic/parallel_for_indexers.cpp  | 75 ++++++++-----------
 .../test/basic/queue_parallel_for_generic.cpp | 34 ++++++---
 4 files changed, 61 insertions(+), 57 deletions(-)

diff --git a/libsycl/include/sycl/__impl/index_space_classes.hpp b/libsycl/include/sycl/__impl/index_space_classes.hpp
index 0dc8e90decc3d..d47f803886235 100644
--- a/libsycl/include/sycl/__impl/index_space_classes.hpp
+++ b/libsycl/include/sycl/__impl/index_space_classes.hpp
@@ -19,6 +19,7 @@
 
 #include <cstddef>
 #include <type_traits>
+#include <variant>
 
 _LIBSYCL_BEGIN_NAMESPACE_SYCL
 
@@ -401,7 +402,7 @@ template <int Dimensions /* = 1*/, bool WithOffset /* = true*/> class item {
 private:
   range<Dimensions> MRange;
   id<Dimensions> MId;
-  id<Dimensions> MOffset;
+  std::conditional_t<WithOffset, id<Dimensions>, std::monostate> MOffset;
 
   friend class detail::Builder;
 };
diff --git a/libsycl/include/sycl/__spirv/spirv_vars.hpp b/libsycl/include/sycl/__spirv/spirv_vars.hpp
index c8d2c990d76c3..450f581d9506d 100644
--- a/libsycl/include/sycl/__spirv/spirv_vars.hpp
+++ b/libsycl/include/sycl/__spirv/spirv_vars.hpp
@@ -15,8 +15,8 @@
 #ifndef _LIBSYCL___SPIRV_SPIRV_VARS
 #define _LIBSYCL___SPIRV_SPIRV_VARS
 
-#  include <cstddef>
-#  include <cstdint>
+#include <cstddef>
+#include <cstdint>
 
 // SPIR-V built-in variables mapped to function call.
 
@@ -58,7 +58,7 @@ __SPIRV_DEFINE_INIT_AND_GET_HELPERS(BuiltInGlobalSize);
 __SPIRV_DEFINE_INIT_AND_GET_HELPERS(BuiltInGlobalInvocationId)
 __SPIRV_DEFINE_INIT_AND_GET_HELPERS(BuiltInGlobalOffset)
 
-#  undef __SPIRV_DEFINE_INIT_AND_GET_HELPERS
+#undef __SPIRV_DEFINE_INIT_AND_GET_HELPERS
 
 } // namespace __spirv
 
diff --git a/libsycl/test/basic/parallel_for_indexers.cpp b/libsycl/test/basic/parallel_for_indexers.cpp
index e9cef87e7472b..078508f53fff5 100644
--- a/libsycl/test/basic/parallel_for_indexers.cpp
+++ b/libsycl/test/basic/parallel_for_indexers.cpp
@@ -11,12 +11,11 @@ using namespace sycl;
 
 // TODO: original test works with buffers, revert changes to USM once they are
 // implemented.
-// TODO add cases with dimensions more than 1
 int main() {
   bool Fail{};
 
   constexpr size_t DataSize = 10;
-  const range<1> globalRange(6);
+  const range<1> GlobalRange(6);
   // Id indexer
   {
     queue Q;
@@ -24,29 +23,31 @@ int main() {
     for (size_t i = 0; i < DataSize; ++i)
       Data[i] = -1;
 
-    Q.parallel_for<class id1>(globalRange,
-                              [=](id<1> index) { Data[index] = index[0]; });
+    Q.parallel_for<class id1>(GlobalRange,
+                              [=](id<1> Index) { Data[Index] = Index[0]; });
     Q.wait();
 
-    for (size_t i = 0; i < DataSize; i++) {
-      const int id = Data[i];
-      if (i < globalRange[0]) {
-        Fail |= !(id == i);
-      } else {
-        Fail |= !(id == -1);
+    Fail |= [&]() {
+      for (size_t i = 0; i < DataSize; ++i) {
+        const int ExpectedVal = i < GlobalRange[0] ? i : -1;
+        if (Data[i] != ExpectedVal) {
+          std::cout << "line: " << __LINE__ << " Data[" << i << "] is "
+                    << Data[i] << " expected " << ExpectedVal << std::endl;
+          return true;
+        }
       }
-    }
+      return false;
+    }();
 
     free(Data, Q);
   }
-  // print and return;
 
   // Item indexer without offset
   {
     // TODO: replace strcut with sycl::int2 once implemented.
     struct DoubleInt {
-      int First;
-      int Second;
+      int Id;
+      int Range;
     };
     queue Q;
     DoubleInt *Data = sycl::malloc_shared<DoubleInt>(DataSize, Q);
@@ -54,38 +55,26 @@ int main() {
       Data[i] = {-1, -1};
 
     Q.parallel_for<class item1_nooffset>(
-        globalRange, [=](item<1, false> index) {
-          Data[index.get_id()] = {int(index.get_id()[0]),
-                                  int(index.get_range()[0])};
+        GlobalRange, [=](item<1, false> Index) {
+          Data[Index.get_id()] = {int(Index.get_id()[0]),
+                                  int(Index.get_range()[0])};
         });
     Q.wait();
-    for (size_t i = 0; i < DataSize; ++i) {
-      const int id = Data[i].First;
-      const int range = Data[i].Second;
-      if (i < globalRange[0]) {
-        Fail |= !(id == i);
-        Fail |= !(range == globalRange[0]);
-      } else {
-        Fail |= !(id == -1);
-        Fail |= !(range == -1);
-      }
-    }
-    free(Data, Q);
-  }
 
-  // get_linear_id()
-  {
-    queue Q;
-    size_t DataSize3D = DataSize * DataSize * DataSize;
-    int *Data = sycl::malloc_shared<int>(DataSize3D, Q);
-    Q.parallel_for(range<3>(DataSize, DataSize, DataSize), [=](item<3> Idx) {
-      auto Id = Idx.get_linear_id();
-      Data[Id] = Id;
-    });
-    Q.wait();
-    for (size_t i = 0; i < DataSize3D; ++i) {
-      Fail |= !(Data[i] == i);
-    }
+    Fail |= [&]() {
+      for (size_t i = 0; i < DataSize; ++i) {
+        const int ExpectedValID = i < GlobalRange[0] ? i : -1;
+        const int ExpectedValRange = i < GlobalRange[0] ? GlobalRange[0] : -1;
+        if (Data[i].Id != ExpectedValID || Data[i].Range != ExpectedValRange) {
+          std::cout << "line: " << __LINE__ << " Data[" << i << "] is {"
+                    << Data[i].Id << ", " << Data[i].Range << "} expected {"
+                    << ExpectedValID << ", " << ExpectedValRange << "}"
+                    << std::endl;
+          return true;
+        }
+      }
+      return false;
+    }();
     free(Data, Q);
   }
 
diff --git a/libsycl/test/basic/queue_parallel_for_generic.cpp b/libsycl/test/basic/queue_parallel_for_generic.cpp
index cac423b85f218..70a191e6ab220 100644
--- a/libsycl/test/basic/queue_parallel_for_generic.cpp
+++ b/libsycl/test/basic/queue_parallel_for_generic.cpp
@@ -12,36 +12,50 @@ int main() {
   // TODO: uncomment property once it is implemented. now all sycl::queue
   // objects are in-order due to liboffload limitation. Test is intended to
   // check in-order execution.
-  sycl::queue q{/*sycl::property::queue::in_order()*/};
-  auto dev = q.get_device();
-  auto ctx = q.get_context();
+  sycl::queue Q{/*sycl::property::queue::in_order()*/};
+  auto Dev = Q.get_device();
+  auto Ctx = Q.get_context();
   constexpr int N = 8;
 
-  auto A = static_cast<int *>(sycl::malloc_shared(N * sizeof(int), dev, ctx));
+  auto A = static_cast<int *>(sycl::malloc_shared(N * sizeof(int), Dev, Ctx));
 
-  for (int i = 0; i < N; i++) {
+  for (int i = 0; i < N; ++i) {
     A[i] = 1;
   }
 
-  q.parallel_for<class Bar>(N, [=](auto i) {
+  Q.parallel_for<class IntRange>(N, [=](auto i) {
     static_assert(std::is_same<decltype(i), sycl::item<1>>::value,
                   "lambda arg type is unexpected");
     A[i]++;
   });
 
-  q.parallel_for<class Foo>({N}, [=](auto i) {
+  Q.parallel_for<class InitRange>({N}, [=](auto i) {
     static_assert(std::is_same<decltype(i), sycl::item<1>>::value,
                   "lambda arg type is unexpected");
     A[i]++;
   });
 
+  Q.parallel_for<class InitRange2D>({4, 2}, [=](auto i) {
+    static_assert(std::is_same<decltype(i), sycl::item<2>>::value,
+                  "lambda arg type is unexpected");
+    A[i.get_linear_id()]++;
+  });
+
+  Q.parallel_for<class InitRange3D>({2, 2, 2}, [=](auto i) {
+    static_assert(std::is_same<decltype(i), sycl::item<3>>::value,
+                  "lambda arg type is unexpected");
+    A[i.get_linear_id()]++;
+  });
+
   // TODO: add kernel with offset and kernel with nd_range once they
   // are implemented.
 
-  q.wait();
+  Q.wait();
 
+  bool Fail{};
   for (int i = 0; i < N; i++) {
-    assert(A[i] == 3);
+    Fail |= !(A[i] == 5);
   }
-  sycl::free(A, ctx);
+  sycl::free(A, Ctx);
+  return Fail;
 }