[libcxx] [libcxxabi] [llvm] Adding OpenMP Offloading Backend for C++ Parallel Algorithms (Rebased #66968) (PR #122180)

Wed Jan 8 14:25:27 PST 2025

https://github.com/jdoerfert updated https://github.com/llvm/llvm-project/pull/122180

>From a6c129d1c9e58ada81d2a43bd15501084a78f2ca Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Wed, 20 Sep 2023 17:06:10 -0700
Subject: [PATCH] Adding OpenMP Offloading Backend for C++ Parallel Algorithms
 Rebased

---
 .github/workflows/libcxx-build-and-test.yaml  |   1 +
 libcxx/CMakeLists.txt                         |  12 +-
 libcxx/cmake/caches/Generic-pstl-openmp.cmake |   1 +
 libcxx/docs/UserDocumentation.rst             | 101 ++++
 libcxx/docs/VendorDocumentation.rst           |  11 +
 libcxx/include/CMakeLists.txt                 |   1 +
 libcxx/include/__config_site.in               |   1 +
 libcxx/include/__pstl/backend.h               |   4 +
 libcxx/include/__pstl/backend_fwd.h           |   4 +
 libcxx/include/__pstl/backends/openmp.h       | 531 ++++++++++++++++++
 libcxx/include/__pstl/dispatch.h              |  15 +
 libcxx/include/module.modulemap               |   5 +
 libcxx/test/configs/cmake-bridge.cfg.in       |   1 +
 .../alg.pstl.openmp/fill_offload.pass.cpp     |  53 ++
 .../alg.pstl.openmp/find_if.pass.cpp          |  70 +++
 .../alg.pstl.openmp/find_if_funptr.pass.cpp   |  39 ++
 .../alg.pstl.openmp/find_if_offload.pass.cpp  |  42 ++
 .../alg.pstl.openmp/for_each_funptr.pass.cpp  |  39 ++
 .../alg.pstl.openmp/for_each_lambda.pass.cpp  |  52 ++
 .../alg.pstl.openmp/for_each_offload.pass.cpp |  42 ++
 .../for_each_overwrite_input.pass.cpp         |  66 +++
 .../gpu_environment_variables.pass.cpp        |  53 ++
 .../openmp_version_40.verify.cpp              |  21 +
 .../openmp_version_45.verify.cpp              |  21 +
 .../openmp_version_51.verify.cpp              |  21 +
 .../transform_offload.pass.cpp                |  58 ++
 .../transform_reduce_offload.pass.cpp         |  45 ++
 ...educe_supported_binary_operations.pass.cpp | 203 +++++++
 libcxx/utils/libcxx/test/features.py          |  18 +
 libcxx/utils/run.py                           |  18 +
 libcxxabi/CMakeLists.txt                      |   8 +
 31 files changed, 1555 insertions(+), 2 deletions(-)
 create mode 100644 libcxx/cmake/caches/Generic-pstl-openmp.cmake
 create mode 100644 libcxx/include/__pstl/backends/openmp.h
 create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.openmp/fill_offload.pass.cpp
 create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if.pass.cpp
 create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_funptr.pass.cpp
 create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_offload.pass.cpp
 create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_funptr.pass.cpp
 create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_lambda.pass.cpp
 create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_offload.pass.cpp
 create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_overwrite_input.pass.cpp
 create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.openmp/gpu_environment_variables.pass.cpp
 create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_version_40.verify.cpp
 create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_version_45.verify.cpp
 create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_version_51.verify.cpp
 create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_offload.pass.cpp
 create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_reduce_offload.pass.cpp
 create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_reduce_supported_binary_operations.pass.cpp

diff --git a/.github/workflows/libcxx-build-and-test.yaml b/.github/workflows/libcxx-build-and-test.yaml
index a28bf4d5daf6d5d..21d3f380b72c13f 100644
--- a/.github/workflows/libcxx-build-and-test.yaml
+++ b/.github/workflows/libcxx-build-and-test.yaml
@@ -146,6 +146,7 @@ jobs:
           'generic-no-wide-characters',
           'generic-no-rtti',
           'generic-optimized-speed',
+          'generic-pstl-openmp',
           'generic-static',
           'bootstrapping-build'
         ]
diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt
index abe12c2805a7cf8..dee2a75f74d89f0 100644
--- a/libcxx/CMakeLists.txt
+++ b/libcxx/CMakeLists.txt
@@ -300,10 +300,11 @@ option(LIBCXX_HAS_EXTERNAL_THREAD_API
    This option may only be set to ON when LIBCXX_ENABLE_THREADS=ON." OFF)
 
 if (LIBCXX_ENABLE_THREADS)
-  set(LIBCXX_PSTL_BACKEND "std_thread" CACHE STRING "Which PSTL backend to use")
+  set(LIBCXX_PSTL_BACKEND_DEFAULT "std_thread")
 else()
-  set(LIBCXX_PSTL_BACKEND "serial" CACHE STRING "Which PSTL backend to use")
+  set(LIBCXX_PSTL_BACKEND_DEFAULT "serial")
 endif()
+set(LIBCXX_PSTL_BACKEND "${LIBCXX_PSTL_BACKEND_DEFAULT}" CACHE STRING "Select the PSTL backend to use. Valid values are serial, std-thread, libdispatch, openmp. Default: ${LIBCXX_PSTL_BACKEND_DEFAULT}")
 
 # Misc options ----------------------------------------------------------------
 # FIXME: Turn -pedantic back ON. It is currently off because it warns
@@ -552,6 +553,11 @@ function(cxx_add_basic_build_flags target)
     endif()
   endif()
   target_compile_options(${target} PUBLIC "${LIBCXX_ADDITIONAL_COMPILE_FLAGS}")
+
+  # If the PSTL backend depends on OpenMP, we must enable the OpenMP tool chain
+  if (LIBCXX_PSTL_BACKEND STREQUAL "openmp")
+    target_add_compile_flags_if_supported(${target} PUBLIC -fopenmp)
+  endif()
 endfunction()
 
 # Exception flags =============================================================
@@ -784,6 +790,8 @@ elseif(LIBCXX_PSTL_BACKEND STREQUAL "std_thread")
   config_define(1 _LIBCPP_PSTL_BACKEND_STD_THREAD)
 elseif(LIBCXX_PSTL_BACKEND STREQUAL "libdispatch")
   config_define(1 _LIBCPP_PSTL_BACKEND_LIBDISPATCH)
+elseif (LIBCXX_PSTL_BACKEND STREQUAL "openmp")
+  config_define(1 _LIBCPP_PSTL_BACKEND_OPENMP)
 else()
   message(FATAL_ERROR "LIBCXX_PSTL_BACKEND is set to ${LIBCXX_PSTL_BACKEND}, which is not a valid backend.
                        Valid backends are: serial, std_thread and libdispatch")
diff --git a/libcxx/cmake/caches/Generic-pstl-openmp.cmake b/libcxx/cmake/caches/Generic-pstl-openmp.cmake
new file mode 100644
index 000000000000000..f3ff4f3b57fd212
--- /dev/null
+++ b/libcxx/cmake/caches/Generic-pstl-openmp.cmake
@@ -0,0 +1 @@
+set(LIBCXX_PSTL_BACKEND openmp CACHE STRING "")
diff --git a/libcxx/docs/UserDocumentation.rst b/libcxx/docs/UserDocumentation.rst
index 2c1bc1373659c39..2e38963a36804c3 100644
--- a/libcxx/docs/UserDocumentation.rst
+++ b/libcxx/docs/UserDocumentation.rst
@@ -329,6 +329,107 @@ and as such, libc++ does not go out of its way to support them. The library may
 compiler extensions which would then be documented explicitly, but the basic expectation should be
 that no special support is provided for arbitrary compiler extensions.
 
+Offloading C++ Parallel Algorithms to GPUs
+------------------------------------------
+
+Experimental support for GPU offloading has been added to ``libc++``. The
+implementation uses OpenMP target offloading to leverage GPU compute resources.
+The OpenMP PSTL backend can target both NVIDIA and AMD GPUs.
+However, the implementation only supports contiguous iterators, such as
+iterators for ``std::vector`` or ``std::array``.
+To enable the OpenMP offloading backend it must be selected with
+``LIBCXX_PSTL_BACKEND=openmp`` when installing ``libc++``. Further, when
+compiling a program, the user must specify the command line options
+``-fopenmp -fexperimental-library``. To install LLVM with OpenMP offloading
+enabled, please read
+`the LLVM OpenMP FAQ. <https://openmp.llvm.org/SupportAndFAQ.html>`_
+You may also want to to visit
+`the OpenMP offloading command-line argument reference. <https://openmp.llvm.org/CommandLineArgumentReference.html#offload-command-line-arguments>`_
+
+Example
+~~~~~~~
+
+The following is an example of offloading vector addition to a GPU using our
+standard library extension. It implements the classical vector addition from
+BLAS that overwrites the vector ``y`` with ``y=a*x+y``. Thus ``y.begin()`` is
+both used as an input and an output iterator in this example.
+
+.. code-block:: cpp
+
+  #include <algorithm>
+  #include <execution>
+
+  template <typename T1, typename T2, typename T3>
+  void axpy(const T1 a, const std::vector<T2> &x, std::vector<T3> &y) {
+    std::transform(std::execution::par_unseq, x.begin(), x.end(), y.begin(),
+                  y.begin(), [=](T2 xi, T3 yi) { return a * xi + yi; });
+  }
+
+The execution policy ``std::execution::par_unseq`` states that the algorithm's
+execution may be parallelized, vectorized, and migrated across threads. This is
+the only execution mode that is safe to offload to GPUs, and for all other
+execution modes the algorithms will execute on the CPU.
+Special attention must be paid to the lambda captures when enabling GPU
+offloading. If the lambda captures by reference, the user must manually map the
+variables to the device. If capturing by reference, the above example could
+be implemented in the following way.
+
+.. code-block:: cpp
+
+  template <typename T1, typename T2, typename T3>
+  void axpy(const T1 a, const std::vector<T2> &x, std::vector<T3> &y) {
+  #pragma omp target data map(to : a)
+    std::transform(std::execution::par_unseq, x.begin(), x.end(), y.begin(),
+                  y.begin(), [&](T2 xi, T3 yi) { return a * xi + yi; });
+  }
+
+However, if unified shared memory, USM, is enabled, no additional data mapping
+is necessary when capturing y reference.
+
+Compiling functions for GPUs with OpenMP
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The C++ standard defines that all accesses to memory are inside a single address
+space. However, discrete GPU systems have distinct address spaces. A single
+address space can be emulated if your system supports unified shared memory.
+However, many discrete GPU systems do not, and in those cases it is important to
+pass device function pointers to the parallel algorithms. Below is an example of
+how the OpenMP ``declare target`` directive with the ``indirect`` clause can be
+used to mark that a function should be compiled for both host and device.
+
+.. code-block:: cpp
+
+  // This function computes the squared difference of two floating points
+  float squared(float a, float b) { return a * a - 2.0f * a * b + b * b; };
+
+  // Declare that the function must be compiled for both host and device
+  #pragma omp declare target indirect to(squared)
+
+  int main() {
+    std::vector<float> a(100, 1.0);
+    std::vector<float> b(100, 1.25);
+
+    // Pass the host function pointer to the parallel algorithm and let OpenMP
+    // translate it to the device function pointer internally
+    float sum =
+        std::transform_reduce(std::execution::par_unseq, a.begin(), a.end(),
+                              b.begin(), 0.0f, std::plus{}, squared);
+
+    // Validate that the result is approximately 6.25
+    assert(std::abs(sum - 6.25f) < 1e-10);
+    return 0;
+  }
+
+Without unified shared memory, the above example will not work if the host
+function pointer ``squared`` is passed to the parallel algorithm.
+
+Important notes about exception handling
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+GPU architectures do not support exception handling and, for now,
+``-fno-exceptions`` is required to offload to the GPU. Parallel CPU fallback 
+is available without restrictions.
+
 Platform specific behavior
 ==========================
 
diff --git a/libcxx/docs/VendorDocumentation.rst b/libcxx/docs/VendorDocumentation.rst
index 959a28607d75ddf..6d1bd87c2378be5 100644
--- a/libcxx/docs/VendorDocumentation.rst
+++ b/libcxx/docs/VendorDocumentation.rst
@@ -264,6 +264,17 @@ General purpose options
   default assertion handler. If this is specified as a relative path, it
   is assumed to be relative to ``<monorepo>/libcxx``.
 
+.. option:: LIBCXX_PSTL_BACKEND:STRING
+
+  **Default**:: ``"serial"``
+
+  **Values**:: ``serial``, ``std-thread``, ``libdispatch``, ``openmp``
+
+  Select the desired backend for C++ parallel algorithms. All four options can
+  target multi-core CPU architectures, and ``openmp`` can additionally target
+  GPU architectures. The ``openmp`` backend requires OpenMP version 4.5 or
+  later (clang's default is sufficient).
+
 ABI Specific Options
 --------------------
 
diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index 0b484ebe5e87c84..3326db745c54f20 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -613,6 +613,7 @@ set(files
   __pstl/backend_fwd.h
   __pstl/backends/default.h
   __pstl/backends/libdispatch.h
+  __pstl/backends/openmp.h
   __pstl/backends/serial.h
   __pstl/backends/std_thread.h
   __pstl/cpu_algos/any_of.h
diff --git a/libcxx/include/__config_site.in b/libcxx/include/__config_site.in
index fc01aaf2d8746ea..fa1c99264514c03 100644
--- a/libcxx/include/__config_site.in
+++ b/libcxx/include/__config_site.in
@@ -38,6 +38,7 @@
 #cmakedefine _LIBCPP_PSTL_BACKEND_SERIAL
 #cmakedefine _LIBCPP_PSTL_BACKEND_STD_THREAD
 #cmakedefine _LIBCPP_PSTL_BACKEND_LIBDISPATCH
+#cmakedefine _LIBCPP_PSTL_BACKEND_OPENMP
 
 // Hardening.
 #cmakedefine _LIBCPP_HARDENING_MODE_DEFAULT @_LIBCPP_HARDENING_MODE_DEFAULT@
diff --git a/libcxx/include/__pstl/backend.h b/libcxx/include/__pstl/backend.h
index 5980b0708cd340a..c2dab5d42df0a87 100644
--- a/libcxx/include/__pstl/backend.h
+++ b/libcxx/include/__pstl/backend.h
@@ -30,6 +30,10 @@ _LIBCPP_PUSH_MACROS
 #  elif defined(_LIBCPP_PSTL_BACKEND_LIBDISPATCH)
 #    include <__pstl/backends/default.h>
 #    include <__pstl/backends/libdispatch.h>
+#  elif defined(_LIBCPP_PSTL_BACKEND_OPENMP)
+#    include <__pstl/backends/default.h>
+#    include <__pstl/backends/openmp.h>
+#    include <__pstl/backends/std_thread.h>
 #  endif
 
 #endif // _LIBCPP_STD_VER >= 17
diff --git a/libcxx/include/__pstl/backend_fwd.h b/libcxx/include/__pstl/backend_fwd.h
index a7d53b6a1c98942..57035f421533460 100644
--- a/libcxx/include/__pstl/backend_fwd.h
+++ b/libcxx/include/__pstl/backend_fwd.h
@@ -49,6 +49,7 @@ struct __backend_configuration;
 
 struct __default_backend_tag;
 struct __libdispatch_backend_tag;
+struct __openmp_backend_tag;
 struct __serial_backend_tag;
 struct __std_thread_backend_tag;
 
@@ -60,6 +61,9 @@ using __current_configuration _LIBCPP_NODEBUG =
 #  elif defined(_LIBCPP_PSTL_BACKEND_LIBDISPATCH)
 using __current_configuration _LIBCPP_NODEBUG =
     __backend_configuration<__libdispatch_backend_tag, __default_backend_tag>;
+#  elif defined(_LIBCPP_PSTL_BACKEND_OPENMP)
+using __current_configuration _LIBCPP_NODEBUG =
+    __backend_configuration<__openmp_backend_tag, __std_thread_backend_tag, __default_backend_tag>;
 #  else
 
 // ...New vendors can add parallel backends here...
diff --git a/libcxx/include/__pstl/backends/openmp.h b/libcxx/include/__pstl/backends/openmp.h
new file mode 100644
index 000000000000000..73ce7c4eba43662
--- /dev/null
+++ b/libcxx/include/__pstl/backends/openmp.h
@@ -0,0 +1,531 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___PSTL_BACKENDS_OPENMP_H
+#define _LIBCPP___PSTL_BACKENDS_OPENMP_H
+
+// Combined OpenMP CPU and GPU Backend
+// ===================================
+// Contrary to the CPU backends found in ./cpu_backends/, the OpenMP backend can
+// target both CPUs and GPUs. The OpenMP standard defines that when offloading
+// code to an accelerator, the compiler must generate a fallback code for
+// execution on the host. Thereby, the backend works as a CPU backend if no
+// targeted accelerator is available at execution time. The target regions can
+// also be compiled directly for a CPU architecture, for instance by adding the
+// command-line option `-fopenmp-targets=x86_64-pc-linux-gnu` in Clang.
+//
+// When is an Algorithm Offloaded?
+// -------------------------------
+// Only parallel algorithms with the parallel unsequenced execution policy are
+// offloaded to the device. We cannot offload parallel algorithms with a
+// parallel execution policy to GPUs because invocations executing in the same
+// thread "are indeterminately sequenced with respect to each other" which we
+// cannot guarantee on a GPU.
+//
+// The standard draft states that "the semantics [...] allow the implementation
+// to fall back to sequential execution if the system cannot parallelize an
+// algorithm invocation". If it is not deemed safe to offload the parallel
+// algorithm to the device, we first fall back to a parallel unsequenced
+// implementation from ./cpu_backends. The CPU implementation may then fall back
+// to sequential execution. In that way we strive to achieve the best possible
+// performance.
+//
+// Further, "it is the caller's responsibility to ensure that the invocation
+// does not introduce data races or deadlocks."
+//
+// Implicit Assumptions
+// --------------------
+// If the user provides a function pointer as an argument to a parallel
+// algorithm, it is assumed that it is the device pointer as there is currently
+// no way to check whether a host or device pointer was passed.
+//
+// Mapping Clauses
+// ---------------
+// In some of the parallel algorithms, the user is allowed to provide the same
+// iterator as input and output. The order of the maps matters because OpenMP
+// keeps a reference counter of which variables have been mapped to the device.
+// Thereby, a varible is only copied to the device if its reference counter is
+// incremented from zero, and it is only copied back to the host when the
+// reference counter is decremented to zero again.
+// This allows nesting mapped regions, for instance in recursive functions,
+// without enforcing a lot of unnecessary data movement.
+// Therefore, `pragma omp target data map(to:...)` must be used before
+// `pragma omp target data map(alloc:...)`. Conversely, the maps with map
+// modifier `release` must be placed before the maps with map modifier `from`
+// when transferring the result from the device to the host.
+//
+// Example: Assume `a` and `b` are pointers to the same array.
+// ``` C++
+// #pragma omp target enter data map(alloc:a[0:n])
+// // The reference counter is incremented from 0 to 1. a is not copied to the
+// // device because of the `alloc` map modifier.
+// #pragma omp target enter data map(to:b[0:n])
+// // The reference counter is incremented from 1 to 2. b is not copied because
+// // the reference counter is positive. Therefore b, and a, are uninitialized
+// // on the device.
+// ```
+//
+// Exceptions
+// ----------
+// Currently, GPU architectures do not handle exceptions. OpenMP target regions
+// are allowed to contain try/catch statements and throw expressions in Clang,
+// but if a throw expression is reached, it will terminate the program. That
+// does not conform to the C++ standard.
+//
+// [This document](https://eel.is/c++draft/algorithms.parallel) has been used as
+// reference for these considerations.
+
+#include <__algorithm/unwrap_iter.h>
+#include <__config>
+#include <__functional/operations.h>
+#include <__iterator/iterator_traits.h>
+#include <__iterator/wrap_iter.h>
+#include <__pstl/backend_fwd.h>
+#include <__pstl/dispatch.h>
+#include <__type_traits/desugars_to.h>
+#include <__type_traits/is_arithmetic.h>
+#include <__type_traits/is_trivially_copyable.h>
+#include <__type_traits/remove_cvref.h>
+#include <__utility/empty.h>
+#include <__utility/forward.h>
+#include <__utility/move.h>
+#include <execution>
+#include <optional>
+
+#if !defined(_OPENMP)
+#  error "Trying to use the OpenMP PSTL backend, but OpenMP is not enabled. Did you compile with -fopenmp?"
+#elif (defined(_OPENMP) && _OPENMP < 201511)
+#  error                                                                                                               \
+      "OpenMP target offloading has been supported since OpenMP version 4.5 (201511). Please use a more recent version of OpenMP."
+#endif
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+namespace __pstl {
+
+// The following functions can be used to map contiguous array sections to and from the device.
+// For now, they are simple overlays of the OpenMP pragmas, but they should be updated when adding
+// support for other iterator types.
+template <class _Iterator, class _DifferenceType>
+_LIBCPP_HIDE_FROM_ABI void
+__omp_map_to([[maybe_unused]] const _Iterator __p, [[maybe_unused]] const _DifferenceType __len) noexcept {
+  static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
+#pragma omp target enter data map(to : __p[0 : __len])
+}
+
+template <class _Iterator, class _DifferenceType>
+_LIBCPP_HIDE_FROM_ABI void
+__omp_map_from([[maybe_unused]] const _Iterator __p, [[maybe_unused]] const _DifferenceType __len) noexcept {
+  static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
+#pragma omp target exit data map(from : __p[0 : __len])
+}
+
+template <class _Iterator, class _DifferenceType>
+_LIBCPP_HIDE_FROM_ABI void
+__omp_map_alloc([[maybe_unused]] const _Iterator __p, [[maybe_unused]] const _DifferenceType __len) noexcept {
+  static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
+#pragma omp target enter data map(alloc : __p[0 : __len])
+}
+
+template <class _Iterator, class _DifferenceType>
+_LIBCPP_HIDE_FROM_ABI void
+__omp_map_release([[maybe_unused]] const _Iterator __p, [[maybe_unused]] const _DifferenceType __len) noexcept {
+  static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
+#pragma omp target exit data map(release : __p[0 : __len])
+}
+
+//
+// fill
+//
+template <class _Tp, class _DifferenceType, class _Up>
+_LIBCPP_HIDE_FROM_ABI _Tp* __omp_fill(_Tp* __out1, _DifferenceType __n, const _Up& __value) noexcept {
+  __pstl::__omp_map_alloc(__out1, __n);
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wopenmp-mapping"
+#pragma omp target teams distribute parallel for
+  for (_DifferenceType __i = 0; __i < __n; ++__i)
+    *(__out1 + __i) = __value;
+#pragma clang diagnostic pop
+  __pstl::__omp_map_from(__out1, __n);
+  return __out1 + __n;
+}
+
+template <>
+struct __fill<__openmp_backend_tag, execution::parallel_unsequenced_policy> {
+  template <class _Policy, class _ForwardIterator, class _Tp>
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI optional<__empty>
+  operator()(_Policy&& __policy, _ForwardIterator __first, _ForwardIterator __last, _Tp const& __value) const noexcept {
+    using _ValueType = typename iterator_traits<_ForwardIterator>::value_type;
+    if constexpr (__libcpp_is_contiguous_iterator<_ForwardIterator>::value && is_trivially_copyable_v<_ValueType> &&
+                  is_trivially_copyable_v<_Tp>) {
+      __pstl::__omp_fill(std::__unwrap_iter(__first), __last - __first, __value);
+      return __empty{};
+    } else {
+      using _Backends = __backends_after<__current_configuration, __openmp_backend_tag>;
+      using _Fallback = __dispatch<__pstl::__fill, _Backends, __remove_cvref_t<_Policy>>;
+      return _Fallback{}(std::forward<_Policy>(__policy), std::move(__first), std::move(__last), __value);
+    }
+  }
+};
+
+//
+// find_if
+//
+template <class _Tp, class _DifferenceType, class _Predicate>
+_LIBCPP_HIDE_FROM_ABI _Tp* __omp_find_if(_Tp* __first, _DifferenceType __n, _Predicate __pred) noexcept {
+  __pstl::__omp_map_to(__first, __n);
+  _DifferenceType __idx = __n;
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wopenmp-mapping"
+#pragma omp target teams distribute parallel for reduction(min : __idx)
+  for (_DifferenceType __i = 0; __i < __n; ++__i) {
+    if (__pred(*(__first + __i))) {
+      __idx = (__i < __idx) ? __i : __idx;
+    }
+  }
+#pragma clang diagnostic pop
+  __pstl::__omp_map_release(__first, __n);
+  return __first + __idx;
+}
+
+template <>
+struct __find_if<__openmp_backend_tag, execution::parallel_unsequenced_policy> {
+  template <class _Policy, class _ForwardIterator, class _Predicate>
+  _LIBCPP_HIDE_FROM_ABI optional<_ForwardIterator>
+  operator()(_Policy&& __policy, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) const noexcept {
+    using _ValueType = typename iterator_traits<_ForwardIterator>::value_type;
+    if constexpr (__libcpp_is_contiguous_iterator<_ForwardIterator>::value && is_trivially_copyable_v<_ValueType>) {
+      return std::__rewrap_iter(__first, __pstl::__omp_find_if(std::__unwrap_iter(__first), __last - __first, __pred));
+    } else {
+      using _Backends = __backends_after<__current_configuration, __openmp_backend_tag>;
+      using _Fallback = __dispatch<__pstl::__find_if, _Backends, __remove_cvref_t<_Policy>>;
+      return _Fallback{}(std::forward<_Policy>(__policy), std::move(__first), std::move(__last), std::move(__pred));
+    }
+  }
+};
+
+//
+// for_each
+//
+template <class _Tp, class _DifferenceType, class _Function>
+_LIBCPP_HIDE_FROM_ABI _Tp* __omp_for_each(_Tp* __inout1, _DifferenceType __n, _Function __f) noexcept {
+  __pstl::__omp_map_to(__inout1, __n);
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wopenmp-mapping"
+#pragma omp target teams distribute parallel for
+  for (_DifferenceType __i = 0; __i < __n; ++__i)
+    __f(*(__inout1 + __i));
+#pragma clang diagnostic pop
+  __pstl::__omp_map_from(__inout1, __n);
+  return __inout1 + __n;
+}
+
+template <>
+struct __for_each<__openmp_backend_tag, execution::parallel_unsequenced_policy> {
+  template <class _Policy, class _ForwardIterator, class _Functor>
+  _LIBCPP_HIDE_FROM_ABI optional<__empty>
+  operator()(_Policy&& __policy, _ForwardIterator __first, _ForwardIterator __last, _Functor __func) const noexcept {
+    using _ValueType = typename iterator_traits<_ForwardIterator>::value_type;
+    if constexpr (__libcpp_is_contiguous_iterator<_ForwardIterator>::value &&
+                  __libcpp_is_contiguous_iterator<_ForwardIterator>::value && is_trivially_copyable_v<_ValueType>) {
+      __pstl::__omp_for_each(std::__unwrap_iter(__first), __last - __first, std::move(__func));
+      return __empty{};
+    } else {
+      using _Backends = __backends_after<__current_configuration, __openmp_backend_tag>;
+      using _Fallback = __dispatch<__pstl::__for_each, _Backends, __remove_cvref_t<_Policy>>;
+      return _Fallback{}(std::forward<_Policy>(__policy), std::move(__first), std::move(__last), std::move(__func));
+    }
+  }
+};
+
+//
+// transform
+//
+template <class _Tp, class _DifferenceType, class _Up, class _Function>
+_LIBCPP_HIDE_FROM_ABI _Tp* __omp_transform(_Tp* __in1, _DifferenceType __n, _Up* __out1, _Function __f) noexcept {
+  // The order of the following maps matter, as we wish to move the data. If
+  // they were placed in the reverse order, and __in equals __out, then we would
+  // allocate the buffer on the device without copying the data.
+  __pstl::__omp_map_to(__in1, __n);
+  __pstl::__omp_map_alloc(__out1, __n);
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wopenmp-mapping"
+#pragma omp target teams distribute parallel for
+  for (_DifferenceType __i = 0; __i < __n; ++__i)
+    *(__out1 + __i) = __f(*(__in1 + __i));
+#pragma clang diagnostic pop
+  // The order of the following two maps matters, since the user could legally
+  // overwrite __in The "release" map modifier decreases the reference counter
+  // by one, and "from" only moves the data to the host, when the reference
+  // count is decremented to zero.
+  __pstl::__omp_map_release(__in1, __n);
+  __pstl::__omp_map_from(__out1, __n);
+  return __out1 + __n;
+}
+
+template <>
+struct __transform<__openmp_backend_tag, execution::parallel_unsequenced_policy> {
+  template <class _Policy, class _ForwardIterator, class _ForwardOutIterator, class _UnaryOperation>
+  _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator>
+  operator()(_Policy&& __policy,
+             _ForwardIterator __first,
+             _ForwardIterator __last,
+             _ForwardOutIterator __outit,
+             _UnaryOperation __op) const noexcept {
+    using _ValueType = typename iterator_traits<_ForwardIterator>::value_type;
+    if constexpr (__libcpp_is_contiguous_iterator<_ForwardIterator>::value &&
+                  __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value && is_trivially_copyable_v<_ValueType>) {
+      return std::__rewrap_iter(
+          __outit,
+          __omp_transform(std::__unwrap_iter(__first), __last - __first, std::__unwrap_iter(__outit), std::move(__op)));
+    } else {
+      using _Backends = __backends_after<__current_configuration, __openmp_backend_tag>;
+      using _Fallback = __dispatch<__pstl::__transform, _Backends, __remove_cvref_t<_Policy>>;
+      return _Fallback{}(
+          std::forward<_Policy>(__policy), std::move(__first), std::move(__last), std::move(__outit), std::move(__op));
+    }
+  }
+};
+
+//
+// transform_binary
+//
+template <class _Tp, class _DifferenceType, class _Up, class _Vp, class _Function>
+_LIBCPP_HIDE_FROM_ABI _Tp*
+__omp_transform(_Tp* __in1, _DifferenceType __n, _Up* __in2, _Vp* __out1, _Function __f) noexcept {
+  // The order of the following maps matter, as we wish to move the data. If
+  // they were placed in the reverse order, and __out equals __in1 or __in2,
+  // then we would allocate one of the buffer on the device without copying the
+  // data.
+  __pstl::__omp_map_to(__in1, __n);
+  __pstl::__omp_map_to(__in2, __n);
+  __pstl::__omp_map_alloc(__out1, __n);
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wopenmp-mapping"
+#pragma omp target teams distribute parallel for
+  for (_DifferenceType __i = 0; __i < __n; ++__i)
+    *(__out1 + __i) = __f(*(__in1 + __i), *(__in2 + __i));
+#pragma clang diagnostic pop
+  // The order of the following three maps matters, since the user could legally
+  // overwrite either of the inputs if __out equals __in1 or __in2. The
+  // "release" map modifier decreases the reference counter by one, and "from"
+  // only moves the data from the device, when the reference count is
+  // decremented to zero.
+  __pstl::__omp_map_release(__in1, __n);
+  __pstl::__omp_map_release(__in2, __n);
+  __pstl::__omp_map_from(__out1, __n);
+  return __out1 + __n;
+}
+
+template <>
+struct __transform_binary<__openmp_backend_tag, execution::parallel_unsequenced_policy> {
+  template <class _Policy,
+            class _ForwardIterator1,
+            class _ForwardIterator2,
+            class _ForwardOutIterator,
+            class _BinaryOperation>
+  _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator>
+  operator()(_Policy&& __policy,
+             _ForwardIterator1 __first1,
+             _ForwardIterator1 __last1,
+             _ForwardIterator2 __first2,
+             _ForwardOutIterator __outit,
+             _BinaryOperation __op) const noexcept {
+    using _ValueType1 = typename iterator_traits<_ForwardIterator1>::value_type;
+    using _ValueType2 = typename iterator_traits<_ForwardIterator2>::value_type;
+    if constexpr (__libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
+                  __libcpp_is_contiguous_iterator<_ForwardIterator2>::value &&
+                  __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value && is_trivially_copyable_v<_ValueType1> &&
+                  is_trivially_copyable_v<_ValueType2>) {
+      return std::__rewrap_iter(
+          __outit,
+          __pstl::__omp_transform(
+              std::__unwrap_iter(__first1),
+              __last1 - __first1,
+              std::__unwrap_iter(__first2),
+              std::__unwrap_iter(__outit),
+              std::move(__op)));
+    } else {
+      using _Backends = __backends_after<__current_configuration, __openmp_backend_tag>;
+      using _Fallback = __dispatch<__pstl::__transform_binary, _Backends, __remove_cvref_t<_Policy>>;
+      return _Fallback{}(
+          std::forward<_Policy>(__policy),
+          std::move(__first1),
+          std::move(__last1),
+          std::move(__first2),
+          std::move(__outit),
+          std::move(__op));
+    }
+  }
+};
+
+//
+// transform_reduce
+//
+#define _LIBCPP_PSTL_OMP_SIMD_1_REDUCTION(omp_op, std_op)                                                              \
+  template <class _Iterator,                                                                                           \
+            class _DifferenceType,                                                                                     \
+            typename _Tp,                                                                                              \
+            typename _BinaryOperationType,                                                                             \
+            typename _UnaryOperation>                                                                                  \
+  _LIBCPP_HIDE_FROM_ABI _Tp __omp_transform_reduce(                                                                    \
+      _Iterator __first,                                                                                               \
+      _DifferenceType __n,                                                                                             \
+      _Tp __init,                                                                                                      \
+      std_op<_BinaryOperationType> __reduce,                                                                           \
+      _UnaryOperation __transform) noexcept {                                                                          \
+    __pstl::__omp_map_to(__first, __n);                                                                                \
+    _PSTL_PRAGMA(clang diagnostic push)                                                                                \
+    _PSTL_PRAGMA(clang diagnostic ignored "-Wopenmp-mapping")                                                          \
+_PSTL_PRAGMA(omp target teams distribute parallel for reduction(omp_op:__init))                                        \
+    for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                    \
+      __init = __reduce(__init, __transform(*(__first + __i)));                                                        \
+    _PSTL_PRAGMA(clang diagnostic pop)                                                                                 \
+    __pstl::__omp_map_release(__first, __n);                                                                           \
+    return __init;                                                                                                     \
+  }
+
+#define _LIBCPP_PSTL_OMP_SIMD_2_REDUCTION(omp_op, std_op)                                                              \
+  template <class _Iterator1,                                                                                          \
+            class _Iterator2,                                                                                          \
+            class _DifferenceType,                                                                                     \
+            typename _Tp,                                                                                              \
+            typename _BinaryOperationType,                                                                             \
+            typename _UnaryOperation >                                                                                 \
+  _LIBCPP_HIDE_FROM_ABI _Tp __omp_transform_reduce(                                                                    \
+      _Iterator1 __first1,                                                                                             \
+      _Iterator2 __first2,                                                                                             \
+      _DifferenceType __n,                                                                                             \
+      _Tp __init,                                                                                                      \
+      std_op<_BinaryOperationType> __reduce,                                                                           \
+      _UnaryOperation __transform) noexcept {                                                                          \
+    __pstl::__omp_map_to(__first1, __n);                                                                               \
+    __pstl::__omp_map_to(__first2, __n);                                                                               \
+    _PSTL_PRAGMA(clang diagnostic push)                                                                                \
+    _PSTL_PRAGMA(clang diagnostic ignored "-Wopenmp-mapping")                                                          \
+_PSTL_PRAGMA(omp target teams distribute parallel for reduction(omp_op:__init))                                        \
+    for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                    \
+      __init = __reduce(__init, __transform(*(__first1 + __i), *(__first2 + __i)));                                    \
+    _PSTL_PRAGMA(clang diagnostic pop)                                                                                 \
+    __pstl::__omp_map_release(__first1, __n);                                                                          \
+    __pstl::__omp_map_release(__first2, __n);                                                                          \
+    return __init;                                                                                                     \
+  }
+
+#define _LIBCPP_PSTL_OMP_SIMD_REDUCTION(omp_op, std_op)                                                                \
+  _LIBCPP_PSTL_OMP_SIMD_1_REDUCTION(omp_op, std_op)                                                                    \
+  _LIBCPP_PSTL_OMP_SIMD_2_REDUCTION(omp_op, std_op)
+
+_LIBCPP_PSTL_OMP_SIMD_REDUCTION(+, std::plus)
+_LIBCPP_PSTL_OMP_SIMD_REDUCTION(-, std::minus)
+_LIBCPP_PSTL_OMP_SIMD_REDUCTION(*, std::multiplies)
+_LIBCPP_PSTL_OMP_SIMD_REDUCTION(&&, std::logical_and)
+_LIBCPP_PSTL_OMP_SIMD_REDUCTION(||, std::logical_or)
+_LIBCPP_PSTL_OMP_SIMD_REDUCTION(&, std::bit_and)
+_LIBCPP_PSTL_OMP_SIMD_REDUCTION(|, std::bit_or)
+_LIBCPP_PSTL_OMP_SIMD_REDUCTION(^, std::bit_xor)
+
+// Determine whether a reduction is supported by the OpenMP backend
+template <class _T1, class _T2, class _T3>
+struct __is_supported_reduction : std::false_type {};
+
+#define _LIBCPP_PSTL_IS_SUPPORTED_REDUCTION(func)                                                                      \
+  template <class _Tp>                                                                                                 \
+  struct __is_supported_reduction<func<_Tp>, _Tp, _Tp> : true_type {};                                                 \
+  template <class _Tp, class _Up>                                                                                      \
+  struct __is_supported_reduction<func<>, _Tp, _Up> : true_type {};
+
+// __is_trivial_plus_operation already exists
+_LIBCPP_PSTL_IS_SUPPORTED_REDUCTION(std::plus)
+_LIBCPP_PSTL_IS_SUPPORTED_REDUCTION(std::minus)
+_LIBCPP_PSTL_IS_SUPPORTED_REDUCTION(std::multiplies)
+_LIBCPP_PSTL_IS_SUPPORTED_REDUCTION(std::logical_and)
+_LIBCPP_PSTL_IS_SUPPORTED_REDUCTION(std::logical_or)
+_LIBCPP_PSTL_IS_SUPPORTED_REDUCTION(std::bit_and)
+_LIBCPP_PSTL_IS_SUPPORTED_REDUCTION(std::bit_or)
+_LIBCPP_PSTL_IS_SUPPORTED_REDUCTION(std::bit_xor)
+
+template <>
+struct __transform_reduce<__openmp_backend_tag, execution::parallel_unsequenced_policy> {
+  template <class _Policy, class _ForwardIterator, class _Tp, class _Reduction, class _Transform>
+  _LIBCPP_HIDE_FROM_ABI optional<_Tp>
+  operator()(_Policy&& __policy,
+             _ForwardIterator __first,
+             _ForwardIterator __last,
+             _Tp __init,
+             _Reduction __reduce,
+             _Transform __transform) const noexcept {
+    using _ValueType = typename iterator_traits<_ForwardIterator>::value_type;
+    if constexpr (__libcpp_is_contiguous_iterator<_ForwardIterator>::value && is_arithmetic_v<_Tp> &&
+                  __is_supported_reduction<_Reduction, _Tp, _Tp>::value && is_trivially_copyable_v<_ValueType>) {
+      return __pstl::__omp_transform_reduce(
+          std::__unwrap_iter(__first), __last - __first, __init, std::move(__reduce), std::move(__transform));
+    } else {
+      using _Backends = __backends_after<__current_configuration, __openmp_backend_tag>;
+      using _Fallback = __dispatch<__pstl::__transform_reduce, _Backends, __remove_cvref_t<_Policy>>;
+      return _Fallback{}(
+          std::forward<_Policy>(__policy),
+          std::move(__first),
+          std::move(__last),
+          std::move(__init),
+          std::move(__reduce),
+          std::move(__transform));
+    }
+  }
+};
+
+//
+// transform_reduce_binary
+//
+template <>
+struct __transform_reduce_binary<__openmp_backend_tag, execution::parallel_unsequenced_policy> {
+  template <class _Policy,
+            class _ForwardIterator1,
+            class _ForwardIterator2,
+            class _Tp,
+            class _Reduction,
+            class _Transform>
+  _LIBCPP_HIDE_FROM_ABI optional<_Tp> operator()(
+      _Policy&& __policy,
+      _ForwardIterator1 __first1,
+      _ForwardIterator1 __last1,
+      _ForwardIterator2 __first2,
+      _Tp __init,
+      _Reduction __reduce,
+      _Transform __transform) const noexcept {
+    using _ValueType1 = typename iterator_traits<_ForwardIterator1>::value_type;
+    using _ValueType2 = typename iterator_traits<_ForwardIterator2>::value_type;
+    if constexpr (__libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
+                  __libcpp_is_contiguous_iterator<_ForwardIterator2>::value && is_arithmetic_v<_Tp> &&
+                  __is_supported_reduction<_Reduction, _Tp, _Tp>::value && is_trivially_copyable_v<_ValueType1> &&
+                  is_trivially_copyable_v<_ValueType2>) {
+      return __pstl::__omp_transform_reduce(
+          std::__unwrap_iter(__first1),
+          std::__unwrap_iter(__first2),
+          __last1 - __first1,
+          std::move(__init),
+          std::move(__reduce),
+          std::move(__transform));
+    } else {
+      using _Backends = __backends_after<__current_configuration, __openmp_backend_tag>;
+      using _Fallback = __dispatch<__pstl::__transform_reduce_binary, _Backends, __remove_cvref_t<_Policy>>;
+      return _Fallback{}(
+          std::forward<_Policy>(__policy),
+          std::move(__first1),
+          std::move(__last1),
+          std::move(__first2),
+          std::move(__init),
+          std::move(__reduce),
+          std::move(__transform));
+    }
+  }
+};
+
+} // namespace __pstl
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // _LIBCPP___PSTL_BACKENDS_OPENMP_H
diff --git a/libcxx/include/__pstl/dispatch.h b/libcxx/include/__pstl/dispatch.h
index 828842368e3394a..381e21849b83cf1 100644
--- a/libcxx/include/__pstl/dispatch.h
+++ b/libcxx/include/__pstl/dispatch.h
@@ -61,6 +61,21 @@ template <template <class, class> class _Algorithm, class _BackendConfiguration,
 using __dispatch _LIBCPP_NODEBUG =
     typename __find_first_implemented<_Algorithm, _BackendConfiguration, _ExecutionPolicy>::type;
 
+template <class _BackendConfiguration, class _Backend>
+struct __backends_after_impl;
+
+template <class _Backend, class... _RemainingBackends>
+struct __backends_after_impl<__backend_configuration<_Backend, _RemainingBackends...>, _Backend> {
+  using type = __backend_configuration<_RemainingBackends...>;
+};
+
+template <class _B1, class... _Bn, class _Backend>
+struct __backends_after_impl<__backend_configuration<_B1, _Bn...>, _Backend>
+    : __backends_after_impl<__backend_configuration<_Bn...>, _Backend> {};
+
+template <class _BackendConfiguration, class _Backend>
+using __backends_after = typename __backends_after_impl<_BackendConfiguration, _Backend>::type;
+
 } // namespace __pstl
 _LIBCPP_END_NAMESPACE_STD
 
diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap
index 86efbd36b20d1df..075e5d73889b652 100644
--- a/libcxx/include/module.modulemap
+++ b/libcxx/include/module.modulemap
@@ -2165,6 +2165,11 @@ module std [system] {
         export std.pstl.cpu_algos
         export std_core.utility_core.empty
       }
+      module openmp {
+        header "__pstl/backends/openmp.h"
+        export std.pstl.cpu_algos
+        export std_core.utility_core.empty
+      }
     }
     module cpu_algos {
       module any_of {
diff --git a/libcxx/test/configs/cmake-bridge.cfg.in b/libcxx/test/configs/cmake-bridge.cfg.in
index 61f821a7e4f6b89..52c29c20cc4011f 100644
--- a/libcxx/test/configs/cmake-bridge.cfg.in
+++ b/libcxx/test/configs/cmake-bridge.cfg.in
@@ -23,6 +23,7 @@ config.recursiveExpansionLimit = 10
 config.test_exec_root = os.path.join('@LIBCXX_BINARY_DIR@', 'test')
 
 # Add substitutions for bootstrapping the test suite configuration
+config.substitutions.append(('%{binary-dir}', '@LIBCXX_BINARY_DIR@'))
 config.substitutions.append(('%{libcxx-dir}', '@LIBCXX_SOURCE_DIR@'))
 config.substitutions.append(('%{install-prefix}', '@LIBCXX_TESTING_INSTALL_PREFIX@'))
 config.substitutions.append(('%{include-dir}', '@LIBCXX_TESTING_INSTALL_PREFIX@/@LIBCXX_INSTALL_INCLUDE_DIR@'))
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/fill_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/fill_offload.pass.cpp
new file mode 100644
index 000000000000000..6c96f26bc3c3abd
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/fill_offload.pass.cpp
@@ -0,0 +1,53 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This test will fail if the number of devices detected by OpenMP is larger
+// than zero but std::for_each(std::execution::par_unseq,...) is not executed on
+// the device.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// REQUIRES: libcpp-pstl-backend-openmp
+
+// Enable openmp offload for this test.
+// ADDITIONAL_COMPILE_FLAGS: --offload-arch=native -fno-exceptions
+
+#include <algorithm>
+#include <cassert>
+#include <execution>
+#include <vector>
+#include <omp.h>
+
+int main(int, char**) {
+  // Initializing test array
+  const int test_size = 10000;
+  std::vector<int> v(test_size, 2);
+
+  // By making an extra map, we can control when the data is mapped to and from
+  // the device, because the map inside std::fill will then only increment and
+  // decrement reference counters and not move data.
+  int* data = v.data();
+#pragma omp target enter data map(to : data[0 : v.size()])
+  std::fill(std::execution::par_unseq, v.begin(), v.end(), -2);
+
+  // At this point v should only contain the value 2 if we executed on a device.
+  if (omp_get_num_devices())
+    for (int vi : v)
+      assert(
+          vi == 2 &&
+          "std::fill transferred data from device to the host but should only have decreased the reference counter.");
+
+// After moving the result back to the host it should now be -2
+#pragma omp target update from(data[0 : v.size()])
+  for (int vi : v)
+    assert(vi == -2 && "std::fill did not update the result on the device.");
+
+#pragma omp target exit data map(delete : data[0 : v.size()])
+
+  return 0;
+}
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if.pass.cpp
new file mode 100644
index 000000000000000..580deb7c120a953
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if.pass.cpp
@@ -0,0 +1,70 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This test verifies that std::find_if(std::execution::par_unseq,...) always
+// finds the first entry in a vector matching the condition. If it was confused
+// with std::any_of, it could return the indexes in a non-increasing order.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// REQUIRES: libcpp-pstl-backend-openmp
+
+// Enable openmp offload for this test.
+// ADDITIONAL_COMPILE_FLAGS: --offload-arch=native -fno-exceptions
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <execution>
+#include <vector>
+
+template <class _Tp>
+void check_find_if(_Tp& data) {
+  const int len = data.end() - data.begin();
+  // Decrementing the values in the test indices
+  int idx[11] = {0, len / 10, len / 9, len / 8, len / 7, len / 6, len / 5, len / 4, len / 3, len / 2, len - 1};
+  for (auto i : idx) {
+    data[i] -= 1;
+  };
+
+  // Asserting that the minimas are found in the correct order
+  for (auto i : idx) {
+    auto found_min = std::find_if(
+        std::execution::par_unseq, data.begin(), data.end(), [&](decltype(data[0]) & n) -> bool { return n < 2; });
+    assert(found_min == (data.begin() + i));
+    // Incrementing the minimum, so the next one can be found
+    (*found_min) += 1;
+  }
+}
+
+int main(int, char**) {
+  const int test_size = 10000;
+  // Testing with vector of doubles
+  {
+    std::vector<double> v(test_size, 2.0);
+    check_find_if(v);
+  }
+  // Testing with vector of integers
+  {
+    std::vector<int> v(test_size, 2);
+    check_find_if(v);
+  }
+  // Testing with array of doubles
+  {
+    std::array<double, test_size> a;
+    a.fill(2.0);
+    check_find_if(a);
+  }
+  // Testing with array of integers
+  {
+    std::array<int, test_size> a;
+    a.fill(2);
+    check_find_if(a);
+  }
+  return 0;
+}
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_funptr.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_funptr.pass.cpp
new file mode 100644
index 000000000000000..1610aafdb2219e3
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_funptr.pass.cpp
@@ -0,0 +1,39 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This test verifies that we can provide function pointers as input to
+// std::find_if. The OpenMP declare target directive with the `indirect` clause
+// makes an implicit mapping of the host and device function pointers.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// REQUIRES: libcpp-pstl-backend-openmp
+
+// Enable openmp offload for this test.
+// ADDITIONAL_COMPILE_FLAGS: --offload-arch=native -fno-exceptions
+
+#include <algorithm>
+#include <cassert>
+#include <execution>
+#include <vector>
+#include <omp.h>
+
+bool is_odd(int& i) { return (i % 2) == 1; }
+#pragma omp declare target indirect to(is_odd)
+
+int main(int, char**) {
+  const int test_size = 10000;
+  std::vector<int> v(test_size, 2.0);
+  v[123] = 3;
+
+  // Providing for_each a function pointer
+  auto idx = std::find_if(std::execution::par_unseq, v.begin(), v.end(), is_odd);
+
+  assert(idx - v.begin() == 123 && "std::find_if(std::execution::par_unseq,...) does not accept function pointers");
+  return 0;
+}
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_offload.pass.cpp
new file mode 100644
index 000000000000000..d2cff1c97652c25
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_offload.pass.cpp
@@ -0,0 +1,42 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This test will fail if the number of devices detected by OpenMP is larger
+// than zero but syd::find_if(std::execution::par_unseq,...) is not executed on
+// the device.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// REQUIRES: libcpp-pstl-backend-openmp
+
+// Enable openmp offload for this test.
+// ADDITIONAL_COMPILE_FLAGS: --offload-arch=native -fno-exceptions
+
+#include <algorithm>
+#include <cassert>
+#include <execution>
+#include <vector>
+#include <omp.h>
+
+int main(int, char**) {
+  // We only run the test if a device is detected by OpenMP
+  if (omp_get_num_devices() < 1)
+    return 0;
+
+  // Initializing test array
+  const int test_size = 10000;
+  std::vector<double> v(test_size, 1);
+
+  auto idx = std::find_if(std::execution::par_unseq, v.begin(), v.end(), [](double&) -> bool {
+    // Returns true if executed on the host
+    return omp_is_initial_device();
+  });
+  assert(idx == v.end() &&
+         "omp_is_initial_device() returned true in the target region. std::find_if was not offloaded.");
+  return 0;
+}
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_funptr.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_funptr.pass.cpp
new file mode 100644
index 000000000000000..3430d48184fc9f9
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_funptr.pass.cpp
@@ -0,0 +1,39 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This test verifies that we can provide function pointers as input to
+// std::for_each. The OpenMP declare target directive with the `indirect` clause
+// makes an implicit mapping of the host and device function pointers.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// REQUIRES: libcpp-pstl-backend-openmp
+
+// Enable openmp offload for this test.
+// ADDITIONAL_COMPILE_FLAGS: --offload-arch=native -fno-exceptions
+
+#include <algorithm>
+#include <cassert>
+#include <execution>
+#include <vector>
+#include <omp.h>
+
+void cube(double& d) { d *= d * d; }
+#pragma omp declare target indirect to(cube)
+
+int main(int, char**) {
+  const int test_size = 10000;
+  std::vector<double> v(test_size, 2.0);
+
+  // Providing for_each a function pointer
+  std::for_each(std::execution::par_unseq, v.begin(), v.end(), cube);
+
+  for (int vi : v)
+    assert(vi == 8 && "std::for_each(std::execution::par_unseq,...) does not accept function pointers");
+  return 0;
+}
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_lambda.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_lambda.pass.cpp
new file mode 100644
index 000000000000000..57c4f4dc3681a94
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_lambda.pass.cpp
@@ -0,0 +1,52 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This test verifies that we can provide a lambda as input to std::for_each in
+// different ways.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// REQUIRES: libcpp-pstl-backend-openmp
+
+// Enable openmp offload for this test.
+// ADDITIONAL_COMPILE_FLAGS: --offload-arch=native -fno-exceptions
+
+#include <algorithm>
+#include <cassert>
+#include <execution>
+#include <vector>
+#include <omp.h>
+
+template <class Function, class Tp>
+void test_lambda(Function fun, Tp initial_value, Tp final_value) {
+  const int test_size = 10000;
+  std::vector<double> v(test_size, initial_value);
+
+  // Providing for_each a function pointer
+  std::for_each(std::execution::par_unseq, v.begin(), v.end(), fun);
+
+  for (int vi : v)
+    assert(vi == final_value && "std::for_each(std::execution::par_unseq,...) does not accept lambdas");
+}
+
+int main(int, char**) {
+  // Capturing by reference
+  auto cube_ref = [&](double& a) { a *= a * a; };
+  test_lambda(cube_ref, 2.0, 8.0);
+
+  // Capturing by value
+  auto cube_val = [=](double& a) { a *= a * a; };
+  test_lambda(cube_val, 2.0, 8.0);
+
+  // Capturing by reference when using additional input
+  double c1       = 1.0;
+  auto cube_ref_2 = [&](double& a) { a = a * a * a + c1; };
+#pragma omp target data map(to : c1)
+  test_lambda(cube_ref_2, 2.0, 9.0);
+  return 0;
+}
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_offload.pass.cpp
new file mode 100644
index 000000000000000..7436b012a703b2b
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_offload.pass.cpp
@@ -0,0 +1,42 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This test will fail if the number of devices detected by OpenMP is larger
+// than zero but for_each(std::execution::par_unseq,...) is not executed on the
+// device.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// REQUIRES: libcpp-pstl-backend-openmp
+
+// Enable openmp offload for this test.
+// ADDITIONAL_COMPILE_FLAGS: --offload-arch=native -fno-exceptions
+
+#include <algorithm>
+#include <cassert>
+#include <execution>
+#include <vector>
+#include <omp.h>
+
+int main(int, char**) {
+  // We only run the test if a device is detected by OpenMP
+  if (omp_get_num_devices() < 1)
+    return 0;
+
+  // Initializing test array
+  const int test_size = 10000;
+  std::vector<int> v(test_size);
+  std::for_each(std::execution::par_unseq, v.begin(), v.end(), [](int& n) {
+    // Returns true if executed on the host
+    n = omp_is_initial_device();
+  });
+
+  for (int vi : v)
+    assert(vi == 0 && "omp_is_initial_device() returned true in the target region. std::for_each was not offloaded.");
+  return 0;
+}
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_overwrite_input.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_overwrite_input.pass.cpp
new file mode 100644
index 000000000000000..1a2c6540efeaad9
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_overwrite_input.pass.cpp
@@ -0,0 +1,66 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This test verifies that you can overwrite the input in
+// std::for_each(std::execution::par_unseq,...). If the result was not copied
+// back from the device to the host, this test would fail.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// REQUIRES: libcpp-pstl-backend-openmp
+
+// Enable openmp offload for this test.
+// ADDITIONAL_COMPILE_FLAGS: --offload-arch=native -fno-exceptions
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <execution>
+#include <vector>
+
+template <class _Tp, class _Predicate, class _Up>
+void overwrite(_Tp& data, _Predicate pred, const _Up& value) {
+  // This function assumes that pred will never be the identity transformation
+
+  // Updating the array with a lambda
+  std::for_each(std::execution::par_unseq, data.begin(), data.end(), pred);
+
+  // Asserting that no elements have the intial value
+  for (int di : data)
+    assert(
+        di != value &&
+        "The GPU implementation of std::for_each does not allow users to mutate the input as the C++ standard does.");
+}
+
+int main(int, char**) {
+  const double value  = 2.0;
+  const int test_size = 10000;
+  // Testing with vector of doubles
+  {
+    std::vector<double> v(test_size, value);
+    overwrite(v, [&](double& n) { n *= n; }, value);
+  }
+  // Testing with vector of integers
+  {
+    std::vector<int> v(test_size, (int)value);
+    overwrite(v, [&](int& n) { n *= n; }, (int)value);
+  }
+  // Testing with array of doubles
+  {
+    std::array<double, test_size> a;
+    a.fill(value);
+    overwrite(a, [&](double& n) { n *= n; }, value);
+  }
+  // Testing with array of integers
+  {
+    std::array<int, test_size> a;
+    a.fill((int)value);
+    overwrite(a, [&](int& n) { n *= n; }, (int)value);
+  }
+  return 0;
+}
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/gpu_environment_variables.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/gpu_environment_variables.pass.cpp
new file mode 100644
index 000000000000000..4e7f84eff461a47
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/gpu_environment_variables.pass.cpp
@@ -0,0 +1,53 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This test verifies that the libc++ test configuration forwards the AMD and
+// NVIDIA environment variables specifying the visible devices. Intially when
+// developing the OpenMP offloading tests, this was not the case, and this test
+// will reveal if the configuration is wrong another time.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// REQUIRES: libcpp-pstl-backend-openmp
+
+// Enable openmp offload for this test.
+// ADDITIONAL_COMPILE_FLAGS: -fopenmp-targets=%{triple} -fno-exceptions
+
+#include <string>
+#include <cassert>
+#include <omp.h>
+#include <stdio.h>
+#include <iostream>
+
+std::string get_env_var(std::string const& env_var_name, int& flag) {
+  char* val;
+  val                = getenv(env_var_name.c_str());
+  std::string retval = "";
+  flag               = (val != NULL);
+  return (val != NULL) ? val : "";
+}
+
+int main(int, char**) {
+  // Stores whether the environment variable was found
+  int status = 0;
+
+  // Checking for AMD's enviroment variable for specifying visible devices
+  std::string rocr_visible_devices = get_env_var("ROCR_VISIBLE_DEVICES", status);
+  if (status)
+    assert(
+        (rocr_visible_devices.empty() || (omp_get_num_devices() > 0)) &&
+        "ROCR_VISIBLE_DEVICES was set but no devices were detected by OpenMP. The libc++ test suite is misconfigured.");
+
+  // Checking for NVIDIA's enviroment variable for specifying visible devices
+  std::string cuda_visible_devices = get_env_var("CUDA_VISIBLE_DEVICES", status);
+  if (status)
+    assert(
+        (cuda_visible_devices.empty() || (omp_get_num_devices() > 0)) &&
+        "CUDA_VISIBLE_DEVICES was set but no devices were detected by OpenMP. The libc++ test suite is misconfigured.");
+  return 0;
+}
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_version_40.verify.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_version_40.verify.cpp
new file mode 100644
index 000000000000000..bd23f774b32762f
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_version_40.verify.cpp
@@ -0,0 +1,21 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// OpenMP target offloading has only been supported since version 4.5. This test
+// verifies that a diagnostic error is prompted if the OpenMP version is below
+// the minimum required version.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// ADDITIONAL_COMPILE_FLAGS: -fopenmp -fopenmp-version=40
+
+// REQUIRES: libcpp-pstl-backend-openmp
+
+#include <algorithm>
+
+// expected-error at __pstl/backends/openmp.h:103 {{"OpenMP target offloading has been supported since OpenMP version 4.5 (201511). Please use a more recent version of OpenMP."}}
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_version_45.verify.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_version_45.verify.cpp
new file mode 100644
index 000000000000000..4765ca0d540b346
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_version_45.verify.cpp
@@ -0,0 +1,21 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// OpenMP target offloading has only been supported since version 4.5. This test
+// verifies that one can include algorithm without any diagnostics when using
+// the minimum required version of OpenMP.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// ADDITIONAL_COMPILE_FLAGS: -fopenmp -fopenmp-version=45
+
+// REQUIRES: libcpp-pstl-backend-openmp
+
+#include <algorithm>
+
+// expected-no-diagnostics
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_version_51.verify.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_version_51.verify.cpp
new file mode 100644
index 000000000000000..b7836cb94254869
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_version_51.verify.cpp
@@ -0,0 +1,21 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// OpenMP target offloading has only been supported since version 4.5. This test
+// verifies that one can include algorithm without any diagnostics when using a
+// version that is newer than the minimum requirement.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// ADDITIONAL_COMPILE_FLAGS: -fopenmp -fopenmp-version=51
+
+// REQUIRES: libcpp-pstl-backend-openmp
+
+#include <algorithm>
+
+// expected-no-diagnostics
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_offload.pass.cpp
new file mode 100644
index 000000000000000..dc3607d95c7e1b0
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_offload.pass.cpp
@@ -0,0 +1,58 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This test will fail if the number of devices detected by OpenMP is larger
+// than zero but std::transform(std::execution::par_unseq,...) is not executed
+// on the device.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// REQUIRES: libcpp-pstl-backend-openmp
+
+// Enable openmp offload for this test.
+// ADDITIONAL_COMPILE_FLAGS: --offload-arch=native -fno-exceptions
+
+#include <algorithm>
+#include <cassert>
+#include <execution>
+#include <vector>
+#include <omp.h>
+
+int main(int, char**) {
+  // We only run the test if a device is detected by OpenMP
+  if (omp_get_num_devices() < 1)
+    return 0;
+
+  // Initializing test arrays
+  const int test_size = 10000;
+  std::vector<int> host(test_size);
+  std::vector<int> device(test_size);
+  // Should execute on host
+  std::transform(std::execution::unseq, host.begin(), host.end(), host.begin(), [](int& h) {
+    // Returns true if executed on the host
+    h = omp_is_initial_device();
+    return h;
+  });
+
+  // Asserting the std::transform(std::execution::unseq,...) executed on the host
+  for (int hi : host)
+    assert(hi && "omp_is_initial_device() returned false. std::transform was offloaded but shouldn't be.");
+
+  // Should execute on device
+  std::transform(
+      std::execution::par_unseq, device.begin(), device.end(), host.begin(), device.begin(), [](int& d, int& h) {
+        // Should return fals
+        d = omp_is_initial_device();
+        return h == d;
+      });
+
+  // Asserting the std::transform(std::execution::par_unseq,...) executed on the device
+  for (int di : device)
+    assert(!di && "omp_is_initial_device() returned true in the target region. std::transform was not offloaded.");
+  return 0;
+}
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_reduce_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_reduce_offload.pass.cpp
new file mode 100644
index 000000000000000..d9bf79832ba2112
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_reduce_offload.pass.cpp
@@ -0,0 +1,45 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This test will fail if the number of devices detected by OpenMP is larger
+// than zero but std::transform_reduce(std::execution::par_unseq,...) is not
+// executed on the device.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// REQUIRES: libcpp-pstl-backend-openmp
+
+// Enable openmp offload for this test.
+// ADDITIONAL_COMPILE_FLAGS: --offload-arch=native -fno-exceptions
+
+#include <algorithm>
+#include <cassert>
+#include <execution>
+#include <functional>
+#include <numeric>
+#include <omp.h>
+#include <vector>
+
+int main(int, char**) {
+  // We only run the test if a device is detected by OpenMP
+  if (omp_get_num_devices() < 1)
+    return 0;
+
+  // Initializing test array
+  const int test_size = 10000;
+  std::vector<int> v(test_size, 1);
+  std::vector<int> w(test_size, 1);
+
+  int result = std::transform_reduce(
+      std::execution::par_unseq, v.begin(), v.end(), w.begin(), (int)0, std::plus{}, [](int& n, int& m) {
+        return n + m + omp_is_initial_device(); // Gives 2 if executed on device, 3 if executed on host
+      });
+  assert(result == 2 * test_size &&
+         "omp_is_initial_device() returned true in the target region. std::transform_reduce was not offloaded.");
+  return 0;
+}
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_reduce_supported_binary_operations.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_reduce_supported_binary_operations.pass.cpp
new file mode 100644
index 000000000000000..4d19261b7586177
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_reduce_supported_binary_operations.pass.cpp
@@ -0,0 +1,203 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This test verifies that std::transform_reduce(std::execution::par_unseq,...)
+// can be offloaded for a number of supported binary operations. The following
+// binary operations should be supported for the reducer:
+// - std::plus
+// - std::minus
+// - std::multiplies
+// - std::logical_and
+// - std::logical_or
+// - std::bit_and
+// - std::bit_or
+// - std::bit_xor
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// REQUIRES: libcpp-pstl-backend-openmp
+
+// Enable openmp offload for this test.
+// ADDITIONAL_COMPILE_FLAGS: --offload-arch=native -fno-exceptions
+
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <execution>
+#include <functional>
+#include <iostream>
+#include <numeric>
+#include <omp.h>
+#include <vector>
+
+int main(int, char**) {
+  // We only run the test if a device is detected by OpenMP
+  if (omp_get_num_devices() < 1)
+    return 0;
+
+  // Initializing test array
+  const int test_size = 10000;
+
+  //===--------------------------------------------------------------------===//
+  // Arithmetic binary operators
+  //===--------------------------------------------------------------------===//
+
+  // Addition with doubles
+  {
+    std::vector<double> v(test_size, 1.0);
+    std::vector<double> w(test_size, 2.0);
+    double result = std::transform_reduce(
+        std::execution::par_unseq, v.begin(), v.end(), w.begin(), 5.0, std::plus{}, [](double& a, double& b) {
+          return 0.5 * (b - a) * ((double)!omp_is_initial_device());
+        });
+    assert((std::abs(result - 0.5 * ((double)test_size) - 5.0) < 1e-8) &&
+           "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for the binary "
+           "operation std::plus.");
+  }
+
+  // Subtraction of floats
+  {
+    std::vector<float> v(test_size, 1.0f);
+    std::vector<float> w(test_size, 1.5f);
+    float result = std::transform_reduce(
+        std::execution::par_unseq,
+        v.begin(),
+        v.end(),
+        w.begin(),
+        1.25 * ((float)test_size),
+        std::minus{},
+        [](float& a, float& b) { return 0.5 * (a + b) * ((float)!omp_is_initial_device()); });
+    assert((std::abs(result) < 1e-8f) &&
+           "std::transform_reduce(std::execution::par_unseq,...) does not have the "
+           "intended effect for the binary operation std::minus.");
+  }
+
+  // Multiplication of doubles
+  {
+    std::vector<double> v(test_size, 1.0);
+    std::vector<double> w(test_size, 0.0001);
+    double result = std::transform_reduce(
+        std::execution::par_unseq, v.begin(), v.end(), w.begin(), -1.0, std::multiplies{}, [](double& a, double& b) {
+          return (a + b) * ((double)!omp_is_initial_device());
+        });
+    assert((std::abs(result + pow(1.0001, test_size)) < 1e-8) &&
+           "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for the binary "
+           "operation std::multiplies.");
+  }
+
+  //===--------------------------------------------------------------------===//
+  // Logical binary operators
+  //===--------------------------------------------------------------------===//
+
+  // Logical and
+  {
+    std::vector<int> v(test_size, 1);
+    // The result should be true with an initial value of 1
+    int result =
+        std::transform_reduce(std::execution::par_unseq, v.begin(), v.end(), 1, std::logical_and{}, [](int& a) {
+          return a && !omp_is_initial_device();
+        });
+    assert(result &&
+           "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for the binary "
+           "operation std::logical_and.");
+
+    // And false by an initial value of 0
+    result = std::transform_reduce(std::execution::par_unseq, v.begin(), v.end(), 0, std::logical_and{}, [](int& a) {
+      return a && !omp_is_initial_device();
+    });
+    assert(!result &&
+           "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for the binary "
+           "operation std::logical_and.");
+  }
+
+  // Logical or
+  {
+    std::vector<int> v(test_size, 0);
+    // The result should be true with an initial value of 1
+    int result = std::transform_reduce(std::execution::par_unseq, v.begin(), v.end(), 1, std::logical_or{}, [](int& a) {
+      return a && !omp_is_initial_device();
+    });
+    assert(result &&
+           "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for the binary "
+           "operation std::logical_or.");
+
+    // And false by an initial value of 0
+    result = std::transform_reduce(std::execution::par_unseq, v.begin(), v.end(), 0, std::logical_or{}, [](int& a) {
+      return a && !omp_is_initial_device();
+    });
+    assert(!result && "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for the "
+                      "binary operation std::logical_or.");
+  }
+
+  //===--------------------------------------------------------------------===//
+  // Bitwise binary operators
+  //===--------------------------------------------------------------------===//
+
+  // Bitwise and
+  {
+    std::vector<unsigned int> v(test_size, 3);
+    std::vector<unsigned int> w(test_size, 2);
+    // For odd numbers the result should be true
+    int result =
+        std::transform_reduce(std::execution::par_unseq, v.begin(), v.end(), 0x1, std::bit_and{}, [](unsigned int& a) {
+          return a + omp_is_initial_device();
+        });
+    assert(result && "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for the "
+                     "binary operation std::bit_and.");
+
+    // For even numbers the result should be false
+    result =
+        std::transform_reduce(std::execution::par_unseq, w.begin(), w.end(), 0x1, std::bit_and{}, [](unsigned int& a) {
+          return a + omp_is_initial_device();
+        });
+    assert(!result && "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for the "
+                      "binary operation std::bit_and.");
+  }
+
+  // Bitwise or
+  {
+    std::vector<unsigned int> v(test_size, 0);
+    int result = std::transform_reduce(
+        std::execution::par_unseq, v.begin(), v.end(), 0, std::bit_or{}, [](unsigned int& a) -> unsigned int {
+          return a || omp_is_initial_device();
+        });
+    assert(!result && "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for the "
+                      "binary operation std::bit_or.");
+
+    // After adding a one, the result should be true
+    v[v.size() / 2] = 1;
+    result          = std::transform_reduce(
+        std::execution::par_unseq, v.begin(), v.end(), 0, std::bit_or{}, [](unsigned int& a) -> unsigned int {
+          return a && !omp_is_initial_device();
+        });
+    assert(result && "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for the "
+                     "binary operation std::bit_or.");
+  }
+
+  // Bitwise xor
+  {
+    std::vector<unsigned int> v(test_size, 0xef);
+    int result =
+        std::transform_reduce(std::execution::par_unseq, v.begin(), v.end(), 0, std::bit_xor{}, [](unsigned int& a) {
+          return a << omp_is_initial_device();
+        });
+    assert(result == 0 && "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for "
+                          "the binary operation std::bit_or.");
+
+    // After adding a one, the result should be true
+    v[v.size() / 2] = 0xea;
+    result =
+        std::transform_reduce(std::execution::par_unseq, v.begin(), v.end(), 0, std::bit_xor{}, [](unsigned int& a) {
+          return a << omp_is_initial_device();
+        });
+    assert(result == 5 && "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for "
+                          "the binary operation std::bit_or.");
+  }
+
+  return 0;
+}
diff --git a/libcxx/utils/libcxx/test/features.py b/libcxx/utils/libcxx/test/features.py
index e4b413deff4dba9..d3742e03efd98c9 100644
--- a/libcxx/utils/libcxx/test/features.py
+++ b/libcxx/utils/libcxx/test/features.py
@@ -370,6 +370,7 @@ def _mingwSupportsModules(cfg):
     "_LIBCPP_DEPRECATED_ABI_DISABLE_PAIR_TRIVIAL_COPY_CTOR": "libcpp-deprecated-abi-disable-pair-trivial-copy-ctor",
     "_LIBCPP_ABI_NO_COMPRESSED_PAIR_PADDING": "libcpp-abi-no-compressed-pair-padding",
     "_LIBCPP_PSTL_BACKEND_LIBDISPATCH": "libcpp-pstl-backend-libdispatch",
+    "_LIBCPP_PSTL_BACKEND_OPENMP": "libcpp-pstl-backend-openmp",
 }
 for macro, feature in macros.items():
     DEFAULT_FEATURES.append(
@@ -379,6 +380,23 @@ def _mingwSupportsModules(cfg):
         )
     )
 
+DEFAULT_FEATURES.append(
+    Feature(
+        name="libcpp-pstl-backend-openmp",
+        when=lambda cfg: "_LIBCPP_PSTL_BACKEND_OPENMP" in compilerMacros(cfg),
+        actions=[
+            # Do not add -fopenmp-targets for all tests as it includes other libraries
+            # and that can mess things up. OpenMP tests will add the flag themselves.
+            # However, add everything else needed for OpenMP and offloading:
+            AddFlagIfSupported("-fopenmp"),
+            # The linker needs to find the correct version of libomptarget and make sure
+            # it is used by the binary even in the test environment.
+            AddLinkFlag("-L%{binary-dir}/../../../lib"),
+            AddLinkFlag("-Wl,-rpath,%{binary-dir}/../../../lib"),
+        ],
+    )
+)
+
 true_false_macros = {
     "_LIBCPP_HAS_THREAD_API_EXTERNAL": "libcpp-has-thread-api-external",
     "_LIBCPP_HAS_THREAD_API_PTHREAD": "libcpp-has-thread-api-pthread",
diff --git a/libcxx/utils/run.py b/libcxx/utils/run.py
index 6b4d615444bcfa5..012da9a76462c26 100755
--- a/libcxx/utils/run.py
+++ b/libcxx/utils/run.py
@@ -64,6 +64,24 @@ def main():
         if "TEMP" in os.environ:
             env["TEMP"] = os.environ.get("TEMP")
 
+    # Forwarding offload specific environment variables.
+    for GPU_OPTION in [
+        "CUDA_VISIBLE_DEVICES",
+        "ROCR_VISIBLE_DEVICES",
+        "LIBOMPTARGET_INFO",
+        "LIBOMPTARGET_DEBUG"
+    ]:
+        if GPU_OPTION in os.environ:
+            env[GPU_OPTION] = os.environ[GPU_OPTION]
+
+    # If ROCM_PATH is set, forward it and put the ROCM libraries onto the LD_LIBRARY_PATH too.
+    if "ROCM_PATH" in os.environ:
+        env["ROCM_PATH"] = os.environ["ROCM_PATH"]
+        if not "LD_LIBRARY_PATH" in env:
+            env["LD_LIBRARY_PATH"] = env["ROCM_PATH"] + "/lib"
+        else:
+            env["LD_LIBRARY_PATH"] += os.pathsep + env["ROCM_PATH"] + "/lib"
+
     # Run the command line with the given environment in the execution directory.
     return subprocess.call(commandLine, cwd=args.execdir, env=env, shell=False)
 
diff --git a/libcxxabi/CMakeLists.txt b/libcxxabi/CMakeLists.txt
index 6dcfc51e553213c..fff48b90bf0a5c7 100644
--- a/libcxxabi/CMakeLists.txt
+++ b/libcxxabi/CMakeLists.txt
@@ -455,6 +455,14 @@ if (UNIX AND ${CMAKE_SYSTEM_NAME} MATCHES "AIX")
   add_definitions("-D_XOPEN_SOURCE=700")
 endif()
 
+# If the OpenMP PSTL backend has been enabled for libcxx, OpenMP must be
+# enabled during compilation
+if (DEFINED LIBCXX_PSTL_BACKEND)
+  if (LIBCXX_PSTL_BACKEND STREQUAL "openmp")
+    add_compile_options(-fopenmp)
+  endif()
+endif()
+
 #===============================================================================
 # Setup Source Code
 #===============================================================================