[libcxx] [libcxxabi] [llvm] Adding Separate OpenMP Offloading Backend to `libcxx/include/__algorithm/pstl_backends` (PR #66968)

Tue Jun 18 10:15:31 PDT 2024

https://github.com/AntonRydahl updated https://github.com/llvm/llvm-project/pull/66968

>From 10d408f5335e7e61065118f86bf1bcdc1287aefa Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Wed, 20 Sep 2023 17:06:10 -0700
Subject: [PATCH] Adding OpenMP Offloading Backend for C++ Parallel Algorithms

---
 .github/workflows/libcxx-build-and-test.yaml  |   1 +
 libcxx/CMakeLists.txt                         |  12 +-
 libcxx/cmake/caches/Generic-pstl-openmp.cmake |   1 +
 libcxx/docs/BuildingLibcxx.rst                |  11 +
 libcxx/docs/UsingLibcxx.rst                   | 104 ++++
 libcxx/include/CMakeLists.txt                 |   1 +
 libcxx/include/__config_site.in               |   1 +
 libcxx/include/__pstl/backend.h               |   4 +
 libcxx/include/__pstl/backend_fwd.h           |   4 +
 libcxx/include/__pstl/backends/openmp.h       | 511 ++++++++++++++++++
 libcxx/include/__pstl/dispatch.h              |  15 +
 libcxx/include/module.modulemap               |   4 +
 .../alg.pstl.openmp/fill_offload.pass.cpp     |  52 ++
 .../alg.pstl.openmp/find_if.pass.cpp          |  67 +++
 .../alg.pstl.openmp/find_if_funptr.pass.cpp   |  36 ++
 .../alg.pstl.openmp/find_if_offload.pass.cpp  |  39 ++
 .../alg.pstl.openmp/for_each_funptr.pass.cpp  |  36 ++
 .../alg.pstl.openmp/for_each_lambda.pass.cpp  |  49 ++
 .../alg.pstl.openmp/for_each_offload.pass.cpp |  39 ++
 .../for_each_overwrite_input.pass.cpp         |  63 +++
 .../gpu_environment_variables.pass.cpp        |  49 ++
 .../openmp_version_40.verify.cpp              |  21 +
 .../openmp_version_45.verify.cpp              |  21 +
 .../openmp_version_51.verify.cpp              |  21 +
 .../transform_offload.pass.cpp                |  55 ++
 .../transform_reduce_offload.pass.cpp         |  41 ++
 ...educe_supported_binary_operations.pass.cpp | 199 +++++++
 libcxx/utils/ci/run-buildbot                  |  32 ++
 libcxx/utils/libcxx/test/features.py          |  22 +
 libcxx/utils/run.py                           |  15 +
 libcxxabi/CMakeLists.txt                      |   8 +
 31 files changed, 1532 insertions(+), 2 deletions(-)
 create mode 100644 libcxx/cmake/caches/Generic-pstl-openmp.cmake
 create mode 100644 libcxx/include/__pstl/backends/openmp.h
 create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.openmp/fill_offload.pass.cpp
 create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if.pass.cpp
 create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_funptr.pass.cpp
 create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_offload.pass.cpp
 create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_funptr.pass.cpp
 create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_lambda.pass.cpp
 create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_offload.pass.cpp
 create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_overwrite_input.pass.cpp
 create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.openmp/gpu_environment_variables.pass.cpp
 create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_version_40.verify.cpp
 create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_version_45.verify.cpp
 create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_version_51.verify.cpp
 create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_offload.pass.cpp
 create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_reduce_offload.pass.cpp
 create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_reduce_supported_binary_operations.pass.cpp

diff --git a/.github/workflows/libcxx-build-and-test.yaml b/.github/workflows/libcxx-build-and-test.yaml
index d7c21394ca486..2f25e3b0c47dc 100644
--- a/.github/workflows/libcxx-build-and-test.yaml
+++ b/.github/workflows/libcxx-build-and-test.yaml
@@ -153,6 +153,7 @@ jobs:
           'generic-no-wide-characters',
           'generic-no-rtti',
           'generic-optimized-speed',
+          'generic-pstl-openmp',
           'generic-static',
           # TODO Find a better place for the benchmark and bootstrapping builds to live. They're either very expensive
           # or don't provide much value since the benchmark run results are too noise on the bots.
diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt
index 4b927017f8c2a..c1dec6bf54ab8 100644
--- a/libcxx/CMakeLists.txt
+++ b/libcxx/CMakeLists.txt
@@ -301,10 +301,11 @@ option(LIBCXX_HAS_EXTERNAL_THREAD_API
    This option may only be set to ON when LIBCXX_ENABLE_THREADS=ON." OFF)
 
 if (LIBCXX_ENABLE_THREADS)
-  set(LIBCXX_PSTL_BACKEND "std_thread" CACHE STRING "Which PSTL backend to use")
+  set(LIBCXX_PSTL_BACKEND_DEFAULT "std_thread")
 else()
-  set(LIBCXX_PSTL_BACKEND "serial" CACHE STRING "Which PSTL backend to use")
+  set(LIBCXX_PSTL_BACKEND_DEFAULT "serial")
 endif()
+set(LIBCXX_PSTL_BACKEND "${LIBCXX_PSTL_BACKEND_DEFAULT}" CACHE STRING "Select the PSTL backend to use. Valid values are serial, std-thread, libdispatch, openmp. Default: ${LIBCXX_PSTL_BACKEND_DEFAULT}")
 
 # Misc options ----------------------------------------------------------------
 # FIXME: Turn -pedantic back ON. It is currently off because it warns
@@ -571,6 +572,11 @@ function(cxx_add_basic_build_flags target)
     endif()
   endif()
   target_compile_options(${target} PUBLIC "${LIBCXX_ADDITIONAL_COMPILE_FLAGS}")
+
+  # If the PSTL backend depends on OpenMP, we must enable the OpenMP tool chain
+  if (LIBCXX_PSTL_BACKEND STREQUAL "openmp")
+    target_add_compile_flags_if_supported(${target} PUBLIC -fopenmp)
+  endif()
 endfunction()
 
 # Exception flags =============================================================
@@ -800,6 +806,8 @@ elseif(LIBCXX_PSTL_BACKEND STREQUAL "std_thread")
   config_define(1 _LIBCPP_PSTL_BACKEND_STD_THREAD)
 elseif(LIBCXX_PSTL_BACKEND STREQUAL "libdispatch")
   config_define(1 _LIBCPP_PSTL_BACKEND_LIBDISPATCH)
+elseif (LIBCXX_PSTL_BACKEND STREQUAL "openmp")
+  config_define(1 _LIBCPP_PSTL_BACKEND_OPENMP)
 else()
   message(FATAL_ERROR "LIBCXX_PSTL_BACKEND is set to ${LIBCXX_PSTL_BACKEND}, which is not a valid backend.
                        Valid backends are: serial, std_thread and libdispatch")
diff --git a/libcxx/cmake/caches/Generic-pstl-openmp.cmake b/libcxx/cmake/caches/Generic-pstl-openmp.cmake
new file mode 100644
index 0000000000000..f3ff4f3b57fd2
--- /dev/null
+++ b/libcxx/cmake/caches/Generic-pstl-openmp.cmake
@@ -0,0 +1 @@
+set(LIBCXX_PSTL_BACKEND openmp CACHE STRING "")
diff --git a/libcxx/docs/BuildingLibcxx.rst b/libcxx/docs/BuildingLibcxx.rst
index e425b9dadfe7d..5727005e24fbd 100644
--- a/libcxx/docs/BuildingLibcxx.rst
+++ b/libcxx/docs/BuildingLibcxx.rst
@@ -424,6 +424,17 @@ libc++ Feature Options
   provided, this header will be included by the library, replacing the
   default assertion handler.
 
+.. option:: LIBCXX_PSTL_BACKEND:STRING
+
+  **Default**:: ``"serial"``
+
+  **Values**:: ``serial``, ``std-thread``, ``libdispatch``, ``openmp``
+
+  Select the desired backend for C++ parallel algorithms. All four options can
+  target multi-core CPU architectures, and ``openmp`` can additionally target
+  GPU architectures. The ``openmp`` backend requires OpenMP version 4.5 or
+  later.
+
 
 libc++ ABI Feature Options
 --------------------------
diff --git a/libcxx/docs/UsingLibcxx.rst b/libcxx/docs/UsingLibcxx.rst
index df08875c13bea..f1e7b19ead579 100644
--- a/libcxx/docs/UsingLibcxx.rst
+++ b/libcxx/docs/UsingLibcxx.rst
@@ -364,6 +364,110 @@ Unpoisoning may not be an option, if (for example) you are not maintaining the a
 * You are using allocator, which does not call destructor during deallocation.
 * You are aware that memory allocated with an allocator may be accessed, even when unused by container.
 
+Offloading C++ Parallel Algorithms to GPUs
+------------------------------------------
+
+Experimental support for GPU offloading has been added to ``libc++``. The
+implementation uses OpenMP target offloading to leverage GPU compute resources.
+The OpenMP PSTL backend can target both NVIDIA and AMD GPUs.
+However, the implementation only supports contiguous iterators, such as
+iterators for ``std::vector`` or ``std::array``.
+To enable the OpenMP offloading backend it must be selected with
+``LIBCXX_PSTL_BACKEND=openmp`` when installing ``libc++``. Further, when
+compiling a program, the user must specify the command line options
+``-fopenmp -fexperimental-library``. To install LLVM with OpenMP offloading
+enabled, please read
+`the LLVM OpenMP FAQ. <https://openmp.llvm.org/SupportAndFAQ.html>`_
+You may also want to to visit
+`the OpenMP offloading command-line argument reference. <https://openmp.llvm.org/CommandLineArgumentReference.html#offload-command-line-arguments>`_
+
+Example
+~~~~~~~
+
+The following is an example of offloading vector addition to a GPU using our
+standard library extension. It implements the classical vector addition from
+BLAS that overwrites the vector ``y`` with ``y=a*x+y``. Thus ``y.begin()`` is
+both used as an input and an output iterator in this example.
+
+.. code-block:: cpp
+
+  #include <algorithm>
+  #include <execution>
+
+  template <typename T1, typename T2, typename T3>
+  void axpy(const T1 a, const std::vector<T2> &x, std::vector<T3> &y) {
+    std::transform(std::execution::par_unseq, x.begin(), x.end(), y.begin(),
+                  y.begin(), [=](T2 xi, T3 yi) { return a * xi + yi; });
+  }
+
+The execution policy ``std::execution::par_unseq`` states that the algorithm's
+execution may be parallelized, vectorized, and migrated across threads. This is
+the only execution mode that is safe to offload to GPUs, and for all other
+execution modes the algorithms will execute on the CPU.
+Special attention must be paid to the lambda captures when enabling GPU
+offloading. If the lambda captures by reference, the user must manually map the
+variables to the device. If capturing by reference, the above example could
+be implemented in the following way.
+
+.. code-block:: cpp
+
+  template <typename T1, typename T2, typename T3>
+  void axpy(const T1 a, const std::vector<T2> &x, std::vector<T3> &y) {
+  #pragma omp target data map(to : a)
+    std::transform(std::execution::par_unseq, x.begin(), x.end(), y.begin(),
+                  y.begin(), [&](T2 xi, T3 yi) { return a * xi + yi; });
+  }
+
+However, if unified shared memory, USM, is enabled, no additional data mapping
+is necessary when capturing y reference.
+
+Compiling functions for GPUs with OpenMP
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The C++ standard defines that all accesses to memory are inside a single address
+space. However, discrete GPU systems have distinct address spaces. A single
+address space can be emulated if your system supports unified shared memory.
+However, many discrete GPU systems do not, and in those cases it is important to
+pass device function pointers to the parallel algorithms. Below is an example of
+how the OpenMP ``declare target`` directive with the ``indirect`` clause can be
+used to mark that a function should be compiled for both host and device.
+
+.. code-block:: cpp
+
+  // This function computes the squared difference of two floating points
+  float squared(float a, float b) { return a * a - 2.0f * a * b + b * b; };
+
+  // Declare that the function must be compiled for both host and device
+  #pragma omp declare target indirect to(squared)
+
+  int main() {
+    std::vector<float> a(100, 1.0);
+    std::vector<float> b(100, 1.25);
+
+    // Pass the host function pointer to the parallel algorithm and let OpenMP
+    // translate it to the device function pointer internally
+    float sum =
+        std::transform_reduce(std::execution::par_unseq, a.begin(), a.end(),
+                              b.begin(), 0.0f, std::plus{}, squared);
+
+    // Validate that the result is approximately 6.25
+    assert(std::abs(sum - 6.25f) < 1e-10);
+    return 0;
+  }
+
+Without unified shared memory, the above example will not work if the host
+function pointer ``squared`` is passed to the parallel algorithm.
+
+Important notes about exception handling
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+GPU architectures do not support exception handling. If compiling a program
+containing parallel algorithms with current versions of Clang, a program with
+exceptions in offloaded code regions will compile, but the program will
+terminate if an exception is thrown on the device. This does not conform with
+the C++ standard and exception handling on GPUs will hopefully be better
+supported in future releases of LLVM.
+
 Platform specific behavior
 ==========================
 
diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index 8d0ffd6ed725b..f641abc38bb81 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -579,6 +579,7 @@ set(files
   __pstl/backend_fwd.h
   __pstl/backends/default.h
   __pstl/backends/libdispatch.h
+  __pstl/backends/openmp.h
   __pstl/backends/serial.h
   __pstl/backends/std_thread.h
   __pstl/cpu_algos/any_of.h
diff --git a/libcxx/include/__config_site.in b/libcxx/include/__config_site.in
index 89a14609ee3f9..00693aeb919cb 100644
--- a/libcxx/include/__config_site.in
+++ b/libcxx/include/__config_site.in
@@ -35,6 +35,7 @@
 #cmakedefine _LIBCPP_PSTL_BACKEND_SERIAL
 #cmakedefine _LIBCPP_PSTL_BACKEND_STD_THREAD
 #cmakedefine _LIBCPP_PSTL_BACKEND_LIBDISPATCH
+#cmakedefine _LIBCPP_PSTL_BACKEND_OPENMP
 
 // Hardening.
 #cmakedefine _LIBCPP_HARDENING_MODE_DEFAULT @_LIBCPP_HARDENING_MODE_DEFAULT@
diff --git a/libcxx/include/__pstl/backend.h b/libcxx/include/__pstl/backend.h
index 86d9f28c77fa8..cb47501c19fc8 100644
--- a/libcxx/include/__pstl/backend.h
+++ b/libcxx/include/__pstl/backend.h
@@ -28,6 +28,10 @@ _LIBCPP_PUSH_MACROS
 #elif defined(_LIBCPP_PSTL_BACKEND_LIBDISPATCH)
 #  include <__pstl/backends/default.h>
 #  include <__pstl/backends/libdispatch.h>
+#elif defined(_LIBCPP_PSTL_BACKEND_OPENMP)
+#  include <__pstl/backends/default.h>
+#  include <__pstl/backends/openmp.h>
+#  include <__pstl/backends/std_thread.h>
 #endif
 
 _LIBCPP_POP_MACROS
diff --git a/libcxx/include/__pstl/backend_fwd.h b/libcxx/include/__pstl/backend_fwd.h
index 32c5da576fb3c..ed08d45206a8b 100644
--- a/libcxx/include/__pstl/backend_fwd.h
+++ b/libcxx/include/__pstl/backend_fwd.h
@@ -47,6 +47,7 @@ struct __backend_configuration;
 
 struct __default_backend_tag;
 struct __libdispatch_backend_tag;
+struct __openmp_backend_tag;
 struct __serial_backend_tag;
 struct __std_thread_backend_tag;
 
@@ -56,6 +57,9 @@ using __current_configuration = __backend_configuration<__serial_backend_tag, __
 using __current_configuration = __backend_configuration<__std_thread_backend_tag, __default_backend_tag>;
 #elif defined(_LIBCPP_PSTL_BACKEND_LIBDISPATCH)
 using __current_configuration = __backend_configuration<__libdispatch_backend_tag, __default_backend_tag>;
+#elif defined(_LIBCPP_PSTL_BACKEND_OPENMP)
+using __current_configuration =
+    __backend_configuration<__openmp_backend_tag, __std_thread_backend_tag, __default_backend_tag>;
 #else
 
 // ...New vendors can add parallel backends here...
diff --git a/libcxx/include/__pstl/backends/openmp.h b/libcxx/include/__pstl/backends/openmp.h
new file mode 100644
index 0000000000000..158be91fb9ca9
--- /dev/null
+++ b/libcxx/include/__pstl/backends/openmp.h
@@ -0,0 +1,511 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___PSTL_BACKENDS_OPENMP_H
+#define _LIBCPP___PSTL_BACKENDS_OPENMP_H
+
+// Combined OpenMP CPU and GPU Backend
+// ===================================
+// Contrary to the CPU backends found in ./cpu_backends/, the OpenMP backend can
+// target both CPUs and GPUs. The OpenMP standard defines that when offloading
+// code to an accelerator, the compiler must generate a fallback code for
+// execution on the host. Thereby, the backend works as a CPU backend if no
+// targeted accelerator is available at execution time. The target regions can
+// also be compiled directly for a CPU architecture, for instance by adding the
+// command-line option `-fopenmp-targets=x86_64-pc-linux-gnu` in Clang.
+//
+// When is an Algorithm Offloaded?
+// -------------------------------
+// Only parallel algorithms with the parallel unsequenced execution policy are
+// offloaded to the device. We cannot offload parallel algorithms with a
+// parallel execution policy to GPUs because invocations executing in the same
+// thread "are indeterminately sequenced with respect to each other" which we
+// cannot guarantee on a GPU.
+//
+// The standard draft states that "the semantics [...] allow the implementation
+// to fall back to sequential execution if the system cannot parallelize an
+// algorithm invocation". If it is not deemed safe to offload the parallel
+// algorithm to the device, we first fall back to a parallel unsequenced
+// implementation from ./cpu_backends. The CPU implementation may then fall back
+// to sequential execution. In that way we strive to achieve the best possible
+// performance.
+//
+// Further, "it is the caller's responsibility to ensure that the invocation
+// does not introduce data races or deadlocks."
+//
+// Implicit Assumptions
+// --------------------
+// If the user provides a function pointer as an argument to a parallel
+// algorithm, it is assumed that it is the device pointer as there is currently
+// no way to check whether a host or device pointer was passed.
+//
+// Mapping Clauses
+// ---------------
+// In some of the parallel algorithms, the user is allowed to provide the same
+// iterator as input and output. The order of the maps matters because OpenMP
+// keeps a reference counter of which variables have been mapped to the device.
+// Thereby, a varible is only copied to the device if its reference counter is
+// incremented from zero, and it is only copied back to the host when the
+// reference counter is decremented to zero again.
+// This allows nesting mapped regions, for instance in recursive functions,
+// without enforcing a lot of unnecessary data movement.
+// Therefore, `pragma omp target data map(to:...)` must be used before
+// `pragma omp target data map(alloc:...)`. Conversely, the maps with map
+// modifier `release` must be placed before the maps with map modifier `from`
+// when transferring the result from the device to the host.
+//
+// Example: Assume `a` and `b` are pointers to the same array.
+// ``` C++
+// #pragma omp target enter data map(alloc:a[0:n])
+// // The reference counter is incremented from 0 to 1. a is not copied to the
+// // device because of the `alloc` map modifier.
+// #pragma omp target enter data map(to:b[0:n])
+// // The reference counter is incremented from 1 to 2. b is not copied because
+// // the reference counter is positive. Therefore b, and a, are uninitialized
+// // on the device.
+// ```
+//
+// Exceptions
+// ----------
+// Currently, GPU architectures do not handle exceptions. OpenMP target regions
+// are allowed to contain try/catch statements and throw expressions in Clang,
+// but if a throw expression is reached, it will terminate the program. That
+// does not conform to the C++ standard.
+//
+// [This document](https://eel.is/c++draft/algorithms.parallel) has been used as
+// reference for these considerations.
+
+#include <__algorithm/unwrap_iter.h>
+#include <__config>
+#include <__functional/operations.h>
+#include <__iterator/iterator_traits.h>
+#include <__iterator/wrap_iter.h>
+#include <__pstl/backend_fwd.h>
+#include <__pstl/dispatch.h>
+#include <__type_traits/desugars_to.h>
+#include <__type_traits/is_arithmetic.h>
+#include <__type_traits/is_trivially_copyable.h>
+#include <__type_traits/remove_cvref.h>
+#include <__utility/empty.h>
+#include <__utility/forward.h>
+#include <__utility/move.h>
+#include <execution>
+#include <optional>
+
+#if !defined(_OPENMP)
+#  error "Trying to use the OpenMP PSTL backend, but OpenMP is not enabled. Did you compile with -fopenmp?"
+#elif (defined(_OPENMP) && _OPENMP < 201511)
+#  error                                                                                                               \
+      "OpenMP target offloading has been supported since OpenMP version 4.5 (201511). Please use a more recent version of OpenMP."
+#endif
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+namespace __pstl {
+
+// The following functions can be used to map contiguous array sections to and from the device.
+// For now, they are simple overlays of the OpenMP pragmas, but they should be updated when adding
+// support for other iterator types.
+template <class _Iterator, class _DifferenceType>
+_LIBCPP_HIDE_FROM_ABI void
+__omp_map_to([[maybe_unused]] const _Iterator __p, [[maybe_unused]] const _DifferenceType __len) noexcept {
+  static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
+#pragma omp target enter data map(to : __p[0 : __len])
+}
+
+template <class _Iterator, class _DifferenceType>
+_LIBCPP_HIDE_FROM_ABI void
+__omp_map_from([[maybe_unused]] const _Iterator __p, [[maybe_unused]] const _DifferenceType __len) noexcept {
+  static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
+#pragma omp target exit data map(from : __p[0 : __len])
+}
+
+template <class _Iterator, class _DifferenceType>
+_LIBCPP_HIDE_FROM_ABI void
+__omp_map_alloc([[maybe_unused]] const _Iterator __p, [[maybe_unused]] const _DifferenceType __len) noexcept {
+  static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
+#pragma omp target enter data map(alloc : __p[0 : __len])
+}
+
+template <class _Iterator, class _DifferenceType>
+_LIBCPP_HIDE_FROM_ABI void
+__omp_map_release([[maybe_unused]] const _Iterator __p, [[maybe_unused]] const _DifferenceType __len) noexcept {
+  static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
+#pragma omp target exit data map(release : __p[0 : __len])
+}
+
+//
+// fill
+//
+template <class _Tp, class _DifferenceType, class _Up>
+_LIBCPP_HIDE_FROM_ABI _Tp* __omp_fill(_Tp* __out1, _DifferenceType __n, const _Up& __value) noexcept {
+  __pstl::__omp_map_alloc(__out1, __n);
+#pragma omp target teams distribute parallel for
+  for (_DifferenceType __i = 0; __i < __n; ++__i)
+    *(__out1 + __i) = __value;
+  __pstl::__omp_map_from(__out1, __n);
+  return __out1 + __n;
+}
+
+template <>
+struct __fill<__openmp_backend_tag, execution::parallel_unsequenced_policy> {
+  template <class _Policy, class _ForwardIterator, class _Tp>
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI optional<__empty>
+  operator()(_Policy&& __policy, _ForwardIterator __first, _ForwardIterator __last, _Tp const& __value) const noexcept {
+    using _ValueType = typename iterator_traits<_ForwardIterator>::value_type;
+    if constexpr (__libcpp_is_contiguous_iterator<_ForwardIterator>::value && is_trivially_copyable_v<_ValueType> &&
+                  is_trivially_copyable_v<_Tp>) {
+      __pstl::__omp_fill(std::__unwrap_iter(__first), __last - __first, __value);
+      return __empty{};
+    } else {
+      using _Backends = __backends_after<__current_configuration, __openmp_backend_tag>;
+      using _Fallback = __dispatch<__pstl::__fill, _Backends, __remove_cvref_t<_Policy>>;
+      return _Fallback{}(std::forward<_Policy>(__policy), std::move(__first), std::move(__last), __value);
+    }
+  }
+};
+
+//
+// find_if
+//
+template <class _Tp, class _DifferenceType, class _Predicate>
+_LIBCPP_HIDE_FROM_ABI _Tp* __omp_find_if(_Tp* __first, _DifferenceType __n, _Predicate __pred) noexcept {
+  __pstl::__omp_map_to(__first, __n);
+  _DifferenceType __idx = __n;
+#pragma omp target teams distribute parallel for reduction(min : __idx)
+  for (_DifferenceType __i = 0; __i < __n; ++__i) {
+    if (__pred(*(__first + __i))) {
+      __idx = (__i < __idx) ? __i : __idx;
+    }
+  }
+  __pstl::__omp_map_release(__first, __n);
+  return __first + __idx;
+}
+
+template <>
+struct __find_if<__openmp_backend_tag, execution::parallel_unsequenced_policy> {
+  template <class _Policy, class _ForwardIterator, class _Predicate>
+  _LIBCPP_HIDE_FROM_ABI optional<_ForwardIterator>
+  operator()(_Policy&& __policy, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) const noexcept {
+    using _ValueType = typename iterator_traits<_ForwardIterator>::value_type;
+    if constexpr (__libcpp_is_contiguous_iterator<_ForwardIterator>::value && is_trivially_copyable_v<_ValueType>) {
+      return std::__rewrap_iter(__first, __pstl::__omp_find_if(std::__unwrap_iter(__first), __last - __first, __pred));
+    } else {
+      using _Backends = __backends_after<__current_configuration, __openmp_backend_tag>;
+      using _Fallback = __dispatch<__pstl::__find_if, _Backends, __remove_cvref_t<_Policy>>;
+      return _Fallback{}(std::forward<_Policy>(__policy), std::move(__first), std::move(__last), std::move(__pred));
+    }
+  }
+};
+
+//
+// for_each
+//
+template <class _Tp, class _DifferenceType, class _Function>
+_LIBCPP_HIDE_FROM_ABI _Tp* __omp_for_each(_Tp* __inout1, _DifferenceType __n, _Function __f) noexcept {
+  __pstl::__omp_map_to(__inout1, __n);
+#pragma omp target teams distribute parallel for
+  for (_DifferenceType __i = 0; __i < __n; ++__i)
+    __f(*(__inout1 + __i));
+  __pstl::__omp_map_from(__inout1, __n);
+  return __inout1 + __n;
+}
+
+template <>
+struct __for_each<__openmp_backend_tag, execution::parallel_unsequenced_policy> {
+  template <class _Policy, class _ForwardIterator, class _Functor>
+  _LIBCPP_HIDE_FROM_ABI optional<__empty>
+  operator()(_Policy&& __policy, _ForwardIterator __first, _ForwardIterator __last, _Functor __func) const noexcept {
+    using _ValueType = typename iterator_traits<_ForwardIterator>::value_type;
+    if constexpr (__libcpp_is_contiguous_iterator<_ForwardIterator>::value &&
+                  __libcpp_is_contiguous_iterator<_ForwardIterator>::value && is_trivially_copyable_v<_ValueType>) {
+      __pstl::__omp_for_each(std::__unwrap_iter(__first), __last - __first, std::move(__func));
+      return __empty{};
+    } else {
+      using _Backends = __backends_after<__current_configuration, __openmp_backend_tag>;
+      using _Fallback = __dispatch<__pstl::__for_each, _Backends, __remove_cvref_t<_Policy>>;
+      return _Fallback{}(std::forward<_Policy>(__policy), std::move(__first), std::move(__last), std::move(__func));
+    }
+  }
+};
+
+//
+// transform
+//
+template <class _Tp, class _DifferenceType, class _Up, class _Function>
+_LIBCPP_HIDE_FROM_ABI _Tp* __omp_transform(_Tp* __in1, _DifferenceType __n, _Up* __out1, _Function __f) noexcept {
+  // The order of the following maps matter, as we wish to move the data. If
+  // they were placed in the reverse order, and __in equals __out, then we would
+  // allocate the buffer on the device without copying the data.
+  __pstl::__omp_map_to(__in1, __n);
+  __pstl::__omp_map_alloc(__out1, __n);
+#pragma omp target teams distribute parallel for
+  for (_DifferenceType __i = 0; __i < __n; ++__i)
+    *(__out1 + __i) = __f(*(__in1 + __i));
+  // The order of the following two maps matters, since the user could legally
+  // overwrite __in The "release" map modifier decreases the reference counter
+  // by one, and "from" only moves the data to the host, when the reference
+  // count is decremented to zero.
+  __pstl::__omp_map_release(__in1, __n);
+  __pstl::__omp_map_from(__out1, __n);
+  return __out1 + __n;
+}
+
+template <>
+struct __transform<__openmp_backend_tag, execution::parallel_unsequenced_policy> {
+  template <class _Policy, class _ForwardIterator, class _ForwardOutIterator, class _UnaryOperation>
+  _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator>
+  operator()(_Policy&& __policy,
+             _ForwardIterator __first,
+             _ForwardIterator __last,
+             _ForwardOutIterator __outit,
+             _UnaryOperation __op) const noexcept {
+    using _ValueType = typename iterator_traits<_ForwardIterator>::value_type;
+    if constexpr (__libcpp_is_contiguous_iterator<_ForwardIterator>::value &&
+                  __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value && is_trivially_copyable_v<_ValueType>) {
+      return std::__rewrap_iter(
+          __outit,
+          __omp_transform(
+              std::__unwrap_iter(__first), __last - __first, std::__unwrap_iter(__outit), std::move(__op)));
+    } else {
+      using _Backends = __backends_after<__current_configuration, __openmp_backend_tag>;
+      using _Fallback = __dispatch<__pstl::__transform, _Backends, __remove_cvref_t<_Policy>>;
+      return _Fallback{}(
+          std::forward<_Policy>(__policy), std::move(__first), std::move(__last), std::move(__outit), std::move(__op));
+    }
+  }
+};
+
+//
+// transform_binary
+//
+template <class _Tp, class _DifferenceType, class _Up, class _Vp, class _Function>
+_LIBCPP_HIDE_FROM_ABI _Tp*
+__omp_transform(_Tp* __in1, _DifferenceType __n, _Up* __in2, _Vp* __out1, _Function __f) noexcept {
+  // The order of the following maps matter, as we wish to move the data. If
+  // they were placed in the reverse order, and __out equals __in1 or __in2,
+  // then we would allocate one of the buffer on the device without copying the
+  // data.
+  __pstl::__omp_map_to(__in1, __n);
+  __pstl::__omp_map_to(__in2, __n);
+  __pstl::__omp_map_alloc(__out1, __n);
+#pragma omp target teams distribute parallel for
+  for (_DifferenceType __i = 0; __i < __n; ++__i)
+    *(__out1 + __i) = __f(*(__in1 + __i), *(__in2 + __i));
+  // The order of the following three maps matters, since the user could legally
+  // overwrite either of the inputs if __out equals __in1 or __in2. The
+  // "release" map modifier decreases the reference counter by one, and "from"
+  // only moves the data from the device, when the reference count is
+  // decremented to zero.
+  __pstl::__omp_map_release(__in1, __n);
+  __pstl::__omp_map_release(__in2, __n);
+  __pstl::__omp_map_from(__out1, __n);
+  return __out1 + __n;
+}
+
+template <>
+struct __transform_binary<__openmp_backend_tag, execution::parallel_unsequenced_policy> {
+  template <class _Policy,
+            class _ForwardIterator1,
+            class _ForwardIterator2,
+            class _ForwardOutIterator,
+            class _BinaryOperation>
+  _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator>
+  operator()(_Policy&& __policy,
+             _ForwardIterator1 __first1,
+             _ForwardIterator1 __last1,
+             _ForwardIterator2 __first2,
+             _ForwardOutIterator __outit,
+             _BinaryOperation __op) const noexcept {
+    using _ValueType1 = typename iterator_traits<_ForwardIterator1>::value_type;
+    using _ValueType2 = typename iterator_traits<_ForwardIterator2>::value_type;
+    if constexpr (__libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
+                  __libcpp_is_contiguous_iterator<_ForwardIterator2>::value &&
+                  __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value && is_trivially_copyable_v<_ValueType1> &&
+                  is_trivially_copyable_v<_ValueType2>) {
+      return std::__rewrap_iter(
+          __outit,
+          __pstl::__omp_transform(
+              std::__unwrap_iter(__first1),
+              __last1 - __first1,
+              std::__unwrap_iter(__first2),
+              std::__unwrap_iter(__outit),
+              std::move(__op)));
+    } else {
+      using _Backends = __backends_after<__current_configuration, __openmp_backend_tag>;
+      using _Fallback = __dispatch<__pstl::__transform_binary, _Backends, __remove_cvref_t<_Policy>>;
+      return _Fallback{}(
+          std::forward<_Policy>(__policy),
+          std::move(__first1),
+          std::move(__last1),
+          std::move(__first2),
+          std::move(__outit),
+          std::move(__op));
+    }
+  }
+};
+
+//
+// transform_reduce
+//
+#define _LIBCPP_PSTL_OMP_SIMD_1_REDUCTION(omp_op, std_op)                                                              \
+  template <class _Iterator,                                                                                           \
+            class _DifferenceType,                                                                                     \
+            typename _Tp,                                                                                              \
+            typename _BinaryOperationType,                                                                             \
+            typename _UnaryOperation>                                                                                  \
+  _LIBCPP_HIDE_FROM_ABI _Tp __omp_transform_reduce(                                                                    \
+      _Iterator __first,                                                                                               \
+      _DifferenceType __n,                                                                                             \
+      _Tp __init,                                                                                                      \
+      std_op<_BinaryOperationType> __reduce,                                                                           \
+      _UnaryOperation __transform) noexcept {                                                                          \
+    __pstl::__omp_map_to(__first, __n);                                                                                \
+_PSTL_PRAGMA(omp target teams distribute parallel for reduction(omp_op:__init))                                        \
+    for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                    \
+      __init = __reduce(__init, __transform(*(__first + __i)));                                                        \
+    __pstl::__omp_map_release(__first, __n);                                                                           \
+    return __init;                                                                                                     \
+  }
+
+#define _LIBCPP_PSTL_OMP_SIMD_2_REDUCTION(omp_op, std_op)                                                              \
+  template <class _Iterator1,                                                                                          \
+            class _Iterator2,                                                                                          \
+            class _DifferenceType,                                                                                     \
+            typename _Tp,                                                                                              \
+            typename _BinaryOperationType,                                                                             \
+            typename _UnaryOperation >                                                                                 \
+  _LIBCPP_HIDE_FROM_ABI _Tp __omp_transform_reduce(                                                                    \
+      _Iterator1 __first1,                                                                                             \
+      _Iterator2 __first2,                                                                                             \
+      _DifferenceType __n,                                                                                             \
+      _Tp __init,                                                                                                      \
+      std_op<_BinaryOperationType> __reduce,                                                                           \
+      _UnaryOperation __transform) noexcept {                                                                          \
+    __pstl::__omp_map_to(__first1, __n);                                                                               \
+    __pstl::__omp_map_to(__first2, __n);                                                                               \
+_PSTL_PRAGMA(omp target teams distribute parallel for reduction(omp_op:__init))                                        \
+    for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                    \
+      __init = __reduce(__init, __transform(*(__first1 + __i), *(__first2 + __i)));                                    \
+    __pstl::__omp_map_release(__first1, __n);                                                                          \
+    __pstl::__omp_map_release(__first2, __n);                                                                          \
+    return __init;                                                                                                     \
+  }
+
+#define _LIBCPP_PSTL_OMP_SIMD_REDUCTION(omp_op, std_op)                                                                \
+  _LIBCPP_PSTL_OMP_SIMD_1_REDUCTION(omp_op, std_op)                                                                    \
+  _LIBCPP_PSTL_OMP_SIMD_2_REDUCTION(omp_op, std_op)
+
+_LIBCPP_PSTL_OMP_SIMD_REDUCTION(+, std::plus)
+_LIBCPP_PSTL_OMP_SIMD_REDUCTION(-, std::minus)
+_LIBCPP_PSTL_OMP_SIMD_REDUCTION(*, std::multiplies)
+_LIBCPP_PSTL_OMP_SIMD_REDUCTION(&&, std::logical_and)
+_LIBCPP_PSTL_OMP_SIMD_REDUCTION(||, std::logical_or)
+_LIBCPP_PSTL_OMP_SIMD_REDUCTION(&, std::bit_and)
+_LIBCPP_PSTL_OMP_SIMD_REDUCTION(|, std::bit_or)
+_LIBCPP_PSTL_OMP_SIMD_REDUCTION(^, std::bit_xor)
+
+// Determine whether a reduction is supported by the OpenMP backend
+template <class _T1, class _T2, class _T3>
+struct __is_supported_reduction : std::false_type {};
+
+#define _LIBCPP_PSTL_IS_SUPPORTED_REDUCTION(func)                                                                      \
+  template <class _Tp>                                                                                                 \
+  struct __is_supported_reduction<func<_Tp>, _Tp, _Tp> : true_type {};                                                 \
+  template <class _Tp, class _Up>                                                                                      \
+  struct __is_supported_reduction<func<>, _Tp, _Up> : true_type {};
+
+// __is_trivial_plus_operation already exists
+_LIBCPP_PSTL_IS_SUPPORTED_REDUCTION(std::plus)
+_LIBCPP_PSTL_IS_SUPPORTED_REDUCTION(std::minus)
+_LIBCPP_PSTL_IS_SUPPORTED_REDUCTION(std::multiplies)
+_LIBCPP_PSTL_IS_SUPPORTED_REDUCTION(std::logical_and)
+_LIBCPP_PSTL_IS_SUPPORTED_REDUCTION(std::logical_or)
+_LIBCPP_PSTL_IS_SUPPORTED_REDUCTION(std::bit_and)
+_LIBCPP_PSTL_IS_SUPPORTED_REDUCTION(std::bit_or)
+_LIBCPP_PSTL_IS_SUPPORTED_REDUCTION(std::bit_xor)
+
+template <>
+struct __transform_reduce<__openmp_backend_tag, execution::parallel_unsequenced_policy> {
+  template <class _Policy, class _ForwardIterator, class _Tp, class _Reduction, class _Transform>
+  _LIBCPP_HIDE_FROM_ABI optional<_Tp>
+  operator()(_Policy&& __policy,
+             _ForwardIterator __first,
+             _ForwardIterator __last,
+             _Tp __init,
+             _Reduction __reduce,
+             _Transform __transform) const noexcept {
+    using _ValueType = typename iterator_traits<_ForwardIterator>::value_type;
+    if constexpr (__libcpp_is_contiguous_iterator<_ForwardIterator>::value && is_arithmetic_v<_Tp> &&
+                  __is_supported_reduction<_Reduction, _Tp, _Tp>::value && is_trivially_copyable_v<_ValueType>) {
+      return __pstl::__omp_transform_reduce(
+          std::__unwrap_iter(__first), __last - __first, __init, std::move(__reduce), std::move(__transform));
+    } else {
+      using _Backends = __backends_after<__current_configuration, __openmp_backend_tag>;
+      using _Fallback = __dispatch<__pstl::__transform_reduce, _Backends, __remove_cvref_t<_Policy>>;
+      return _Fallback{}(
+          std::forward<_Policy>(__policy),
+          std::move(__first),
+          std::move(__last),
+          std::move(__init),
+          std::move(__reduce),
+          std::move(__transform));
+    }
+  }
+};
+
+//
+// transform_reduce_binary
+//
+template <>
+struct __transform_reduce_binary<__openmp_backend_tag, execution::parallel_unsequenced_policy> {
+  template <class _Policy,
+            class _ForwardIterator1,
+            class _ForwardIterator2,
+            class _Tp,
+            class _Reduction,
+            class _Transform>
+  _LIBCPP_HIDE_FROM_ABI optional<_Tp> operator()(
+      _Policy&& __policy,
+      _ForwardIterator1 __first1,
+      _ForwardIterator1 __last1,
+      _ForwardIterator2 __first2,
+      _Tp __init,
+      _Reduction __reduce,
+      _Transform __transform) const noexcept {
+    using _ValueType1 = typename iterator_traits<_ForwardIterator1>::value_type;
+    using _ValueType2 = typename iterator_traits<_ForwardIterator2>::value_type;
+    if constexpr (__libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
+                  __libcpp_is_contiguous_iterator<_ForwardIterator2>::value && is_arithmetic_v<_Tp> &&
+                  __is_supported_reduction<_Reduction, _Tp, _Tp>::value && is_trivially_copyable_v<_ValueType1> &&
+                  is_trivially_copyable_v<_ValueType2>) {
+      return __pstl::__omp_transform_reduce(
+          std::__unwrap_iter(__first1),
+          std::__unwrap_iter(__first2),
+          __last1 - __first1,
+          std::move(__init),
+          std::move(__reduce),
+          std::move(__transform));
+    } else {
+      using _Backends = __backends_after<__current_configuration, __openmp_backend_tag>;
+      using _Fallback = __dispatch<__pstl::__transform_reduce_binary, _Backends, __remove_cvref_t<_Policy>>;
+      return _Fallback{}(
+          std::forward<_Policy>(__policy),
+          std::move(__first1),
+          std::move(__last1),
+          std::move(__first2),
+          std::move(__init),
+          std::move(__reduce),
+          std::move(__transform));
+    }
+  }
+};
+
+} // namespace __pstl
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // _LIBCPP___PSTL_BACKENDS_OPENMP_H
diff --git a/libcxx/include/__pstl/dispatch.h b/libcxx/include/__pstl/dispatch.h
index 5e903f7524fe9..c984c22456120 100644
--- a/libcxx/include/__pstl/dispatch.h
+++ b/libcxx/include/__pstl/dispatch.h
@@ -58,6 +58,21 @@ struct __find_first_implemented<_Algorithm, __backend_configuration<_B1, _Bn...>
 template <template <class, class> class _Algorithm, class _BackendConfiguration, class _ExecutionPolicy>
 using __dispatch = typename __find_first_implemented<_Algorithm, _BackendConfiguration, _ExecutionPolicy>::type;
 
+template <class _BackendConfiguration, class _Backend>
+struct __backends_after_impl;
+
+template <class _Backend, class... _RemainingBackends>
+struct __backends_after_impl<__backend_configuration<_Backend, _RemainingBackends...>, _Backend> {
+  using type = __backend_configuration<_RemainingBackends...>;
+};
+
+template <class _B1, class... _Bn, class _Backend>
+struct __backends_after_impl<__backend_configuration<_B1, _Bn...>, _Backend>
+    : __backends_after_impl<__backend_configuration<_Bn...>, _Backend> {};
+
+template <class _BackendConfiguration, class _Backend>
+using __backends_after = typename __backends_after_impl<_BackendConfiguration, _Backend>::type;
+
 } // namespace __pstl
 _LIBCPP_END_NAMESPACE_STD
 
diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap
index 9ffccf66ff094..5276a14f20cb7 100644
--- a/libcxx/include/module.modulemap
+++ b/libcxx/include/module.modulemap
@@ -1600,6 +1600,10 @@ module std_private_pstl_backends_libdispatch       [system] {
   header "__pstl/backends/libdispatch.h"
   export *
 }
+module std_private_pstl_backends_openmp            [system] {
+  header "__pstl/backends/openmp.h"
+  export *
+}
 module std_private_pstl_backends_serial            [system] {
   header "__pstl/backends/serial.h"
   export *
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/fill_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/fill_offload.pass.cpp
new file mode 100644
index 0000000000000..cdde2c3d0a9b9
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/fill_offload.pass.cpp
@@ -0,0 +1,52 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This test will fail if the number of devices detected by OpenMP is larger
+// than zero but std::for_each(std::execution::par_unseq,...) is not executed on
+// the device.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// REQUIRES: libcpp-pstl-backend-openmp
+
+#include <algorithm>
+#include <cassert>
+#include <execution>
+#include <vector>
+#include <omp.h>
+
+int main(int, char**) {
+  // We only run the test if a device is detected by OpenMP
+  if (omp_get_num_devices() < 1)
+    return 0;
+
+  // Initializing test array
+  const int test_size = 10000;
+  std::vector<int> v(test_size, 2);
+
+  // By making an extra map, we can control when the data is mapped to and from
+  // the device, because the map inside std::fill will then only increment and
+  // decrement reference counters and not move data.
+  int* data = v.data();
+#pragma omp target enter data map(to : data[0 : v.size()])
+  std::fill(std::execution::par_unseq, v.begin(), v.end(), -2);
+
+  // At this point v should only contain the value 2
+  for (int vi : v)
+    assert(vi == 2 &&
+           "std::fill transferred data from device to the host but should only have decreased the reference counter.");
+
+// After moving the result back to the host it should now be -2
+#pragma omp target update from(data[0 : v.size()])
+  for (int vi : v)
+    assert(vi == -2 && "std::fill did not update the result on the device.");
+
+#pragma omp target exit data map(delete : data[0 : v.size()])
+
+  return 0;
+}
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if.pass.cpp
new file mode 100644
index 0000000000000..7508d82156e54
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if.pass.cpp
@@ -0,0 +1,67 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This test verifies that std::find_if(std::execution::par_unseq,...) always
+// finds the first entry in a vector matching the condition. If it was confused
+// with std::any_of, it could return the indexes in a non-increasing order.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// REQUIRES: libcpp-pstl-backend-openmp
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <execution>
+#include <vector>
+
+template <class _Tp>
+void check_find_if(_Tp& data) {
+  const int len = data.end() - data.begin();
+  // Decrementing the values in the test indices
+  int idx[11] = {0, len / 10, len / 9, len / 8, len / 7, len / 6, len / 5, len / 4, len / 3, len / 2, len - 1};
+  for (auto i : idx) {
+    data[i] -= 1;
+  };
+
+  // Asserting that the minimas are found in the correct order
+  for (auto i : idx) {
+    auto found_min = std::find_if(
+        std::execution::par_unseq, data.begin(), data.end(), [&](decltype(data[0])& n) -> bool { return n < 2; });
+    assert(found_min == (data.begin() + i));
+    // Incrementing the minimum, so the next one can be found
+    (*found_min) += 1;
+  }
+}
+
+int main(int, char**) {
+  const int test_size = 10000;
+  // Testing with vector of doubles
+  {
+    std::vector<double> v(test_size, 2.0);
+    check_find_if(v);
+  }
+  // Testing with vector of integers
+  {
+    std::vector<int> v(test_size, 2);
+    check_find_if(v);
+  }
+  // Testing with array of doubles
+  {
+    std::array<double, test_size> a;
+    a.fill(2.0);
+    check_find_if(a);
+  }
+  // Testing with array of integers
+  {
+    std::array<int, test_size> a;
+    a.fill(2);
+    check_find_if(a);
+  }
+  return 0;
+}
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_funptr.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_funptr.pass.cpp
new file mode 100644
index 0000000000000..b84dd68ad3f1b
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_funptr.pass.cpp
@@ -0,0 +1,36 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This test verifies that we can provide function pointers as input to
+// std::find_if. The OpenMP declare target directive with the `indirect` clause
+// makes an implicit mapping of the host and device function pointers.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// REQUIRES: libcpp-pstl-backend-openmp
+
+#include <algorithm>
+#include <cassert>
+#include <execution>
+#include <vector>
+#include <omp.h>
+
+bool is_odd(int& i) { return (i % 2) == 1; }
+#pragma omp declare target indirect to(is_odd)
+
+int main(int, char**) {
+  const int test_size = 10000;
+  std::vector<int> v(test_size, 2.0);
+  v[123] = 3;
+
+  // Providing for_each a function pointer
+  auto idx = std::find_if(std::execution::par_unseq, v.begin(), v.end(), is_odd);
+
+  assert(idx - v.begin() == 123 && "std::find_if(std::execution::par_unseq,...) does not accept function pointers");
+  return 0;
+}
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_offload.pass.cpp
new file mode 100644
index 0000000000000..7afed4740a0f8
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_offload.pass.cpp
@@ -0,0 +1,39 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This test will fail if the number of devices detected by OpenMP is larger
+// than zero but syd::find_if(std::execution::par_unseq,...) is not executed on
+// the device.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// REQUIRES: libcpp-pstl-backend-openmp
+
+#include <algorithm>
+#include <cassert>
+#include <execution>
+#include <vector>
+#include <omp.h>
+
+int main(int, char**) {
+  // We only run the test if a device is detected by OpenMP
+  if (omp_get_num_devices() < 1)
+    return 0;
+
+  // Initializing test array
+  const int test_size = 10000;
+  std::vector<double> v(test_size, 1);
+
+  auto idx = std::find_if(std::execution::par_unseq, v.begin(), v.end(), [](double&) -> bool {
+    // Returns true if executed on the host
+    return omp_is_initial_device();
+  });
+  assert(idx == v.end() &&
+         "omp_is_initial_device() returned true in the target region. std::find_if was not offloaded.");
+  return 0;
+}
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_funptr.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_funptr.pass.cpp
new file mode 100644
index 0000000000000..4b9c23860800c
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_funptr.pass.cpp
@@ -0,0 +1,36 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This test verifies that we can provide function pointers as input to
+// std::for_each. The OpenMP declare target directive with the `indirect` clause
+// makes an implicit mapping of the host and device function pointers.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// REQUIRES: libcpp-pstl-backend-openmp
+
+#include <algorithm>
+#include <cassert>
+#include <execution>
+#include <vector>
+#include <omp.h>
+
+void cube(double& d) { d *= d * d; }
+#pragma omp declare target indirect to(cube)
+
+int main(int, char**) {
+  const int test_size = 10000;
+  std::vector<double> v(test_size, 2.0);
+
+  // Providing for_each a function pointer
+  std::for_each(std::execution::par_unseq, v.begin(), v.end(), cube);
+
+  for (int vi : v)
+    assert(vi == 8 && "std::for_each(std::execution::par_unseq,...) does not accept function pointers");
+  return 0;
+}
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_lambda.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_lambda.pass.cpp
new file mode 100644
index 0000000000000..e0b0f18df2726
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_lambda.pass.cpp
@@ -0,0 +1,49 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This test verifies that we can provide a lambda as input to std::for_each in
+// different ways.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// REQUIRES: libcpp-pstl-backend-openmp
+
+#include <algorithm>
+#include <cassert>
+#include <execution>
+#include <vector>
+#include <omp.h>
+
+template <class Function, class Tp>
+void test_lambda(Function fun, Tp initial_value, Tp final_value) {
+  const int test_size = 10000;
+  std::vector<double> v(test_size, initial_value);
+
+  // Providing for_each a function pointer
+  std::for_each(std::execution::par_unseq, v.begin(), v.end(), fun);
+
+  for (int vi : v)
+    assert(vi == final_value && "std::for_each(std::execution::par_unseq,...) does not accept lambdas");
+}
+
+int main(int, char**) {
+  // Capturing by reference
+  auto cube_ref = [&](double& a) { a *= a * a; };
+  test_lambda(cube_ref, 2.0, 8.0);
+
+  // Capturing by value
+  auto cube_val = [=](double& a) { a *= a * a; };
+  test_lambda(cube_val, 2.0, 8.0);
+
+  // Capturing by reference when using additional input
+  double c1       = 1.0;
+  auto cube_ref_2 = [&](double& a) { a = a * a * a + c1; };
+#pragma omp target data map(to : c1)
+  test_lambda(cube_ref_2, 2.0, 9.0);
+  return 0;
+}
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_offload.pass.cpp
new file mode 100644
index 0000000000000..4021e7b18e5f4
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_offload.pass.cpp
@@ -0,0 +1,39 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This test will fail if the number of devices detected by OpenMP is larger
+// than zero but for_each(std::execution::par_unseq,...) is not executed on the
+// device.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// REQUIRES: libcpp-pstl-backend-openmp
+
+#include <algorithm>
+#include <cassert>
+#include <execution>
+#include <vector>
+#include <omp.h>
+
+int main(int, char**) {
+  // We only run the test if a device is detected by OpenMP
+  if (omp_get_num_devices() < 1)
+    return 0;
+
+  // Initializing test array
+  const int test_size = 10000;
+  std::vector<int> v(test_size);
+  std::for_each(std::execution::par_unseq, v.begin(), v.end(), [](int& n) {
+    // Returns true if executed on the host
+    n = omp_is_initial_device();
+  });
+
+  for (int vi : v)
+    assert(vi == 0 && "omp_is_initial_device() returned true in the target region. std::for_each was not offloaded.");
+  return 0;
+}
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_overwrite_input.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_overwrite_input.pass.cpp
new file mode 100644
index 0000000000000..74f177b75e69c
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_overwrite_input.pass.cpp
@@ -0,0 +1,63 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This test verifies that you can overwrite the input in
+// std::for_each(std::execution::par_unseq,...). If the result was not copied
+// back from the device to the host, this test would fail.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// REQUIRES: libcpp-pstl-backend-openmp
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <execution>
+#include <vector>
+
+template <class _Tp, class _Predicate, class _Up>
+void overwrite(_Tp& data, _Predicate pred, const _Up& value) {
+  // This function assumes that pred will never be the identity transformation
+
+  // Updating the array with a lambda
+  std::for_each(std::execution::par_unseq, data.begin(), data.end(), pred);
+
+  // Asserting that no elements have the intial value
+  for (int di : data)
+    assert(
+        di != value &&
+        "The GPU implementation of std::for_each does not allow users to mutate the input as the C++ standard does.");
+}
+
+int main(int, char**) {
+  const double value  = 2.0;
+  const int test_size = 10000;
+  // Testing with vector of doubles
+  {
+    std::vector<double> v(test_size, value);
+    overwrite(v, [&](double& n) { n *= n; }, value);
+  }
+  // Testing with vector of integers
+  {
+    std::vector<int> v(test_size, (int)value);
+    overwrite(v, [&](int& n) { n *= n; }, (int)value);
+  }
+  // Testing with array of doubles
+  {
+    std::array<double, test_size> a;
+    a.fill(value);
+    overwrite(a, [&](double& n) { n *= n; }, value);
+  }
+  // Testing with array of integers
+  {
+    std::array<int, test_size> a;
+    a.fill((int)value);
+    overwrite(a, [&](int& n) { n *= n; }, (int)value);
+  }
+  return 0;
+}
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/gpu_environment_variables.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/gpu_environment_variables.pass.cpp
new file mode 100644
index 0000000000000..801693a3d1eb8
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/gpu_environment_variables.pass.cpp
@@ -0,0 +1,49 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This test verifies that the libc++ test configuration forwards the AMD and
+// NVIDIA environment variables specifying the visible devices. Intially when
+// developing the OpenMP offloading tests, this was not the case, and this test
+// will reveal if the configuration is wrong another time.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// REQUIRES: libcpp-pstl-backend-openmp
+
+#include <string>
+#include <cassert>
+#include <omp.h>
+#include <iostream>
+
+std::string get_env_var(std::string const& env_var_name, int& flag) {
+  char* val;
+  val                = getenv(env_var_name.c_str());
+  std::string retval = "";
+  flag               = (val != NULL);
+  return (val != NULL) ? val : "";
+}
+
+int main(int, char**) {
+  // Stores whether the environment variable was found
+  int status = 0;
+
+  // Checking for AMD's enviroment variable for specifying visible devices
+  std::string rocr_visible_devices = get_env_var("ROCR_VISIBLE_DEVICES", status);
+  if (status)
+    assert(
+        (rocr_visible_devices.empty() || (omp_get_num_devices() > 0)) &&
+        "ROCR_VISIBLE_DEVICES was set but no devices were detected by OpenMP. The libc++ test suite is misconfigured.");
+
+  // Checking for NVIDIA's enviroment variable for specifying visible devices
+  std::string cuda_visible_devices = get_env_var("CUDA_VISIBLE_DEVICES", status);
+  if (status)
+    assert(
+        (cuda_visible_devices.empty() || (omp_get_num_devices() > 0)) &&
+        "CUDA_VISIBLE_DEVICES was set but no devices were detected by OpenMP. The libc++ test suite is misconfigured.");
+  return 0;
+}
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_version_40.verify.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_version_40.verify.cpp
new file mode 100644
index 0000000000000..85b2656788cf3
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_version_40.verify.cpp
@@ -0,0 +1,21 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// OpenMP target offloading has only been supported since version 4.5. This test
+// verifies that a diagnostic error is prompted if the OpenMP version is below
+// the minimum required version.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// ADDITIONAL_COMPILE_FLAGS: -fopenmp -fopenmp-version=40
+
+// REQUIRES: libcpp-pstl-backend-openmp
+
+#include <algorithm>
+
+// expected-error at __algorithm/pstl_backends/openmp/backend.h:26 {{"OpenMP target offloading has been supported since OpenMP version 4.5 (201511). Please use a more recent version of OpenMP."}}
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_version_45.verify.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_version_45.verify.cpp
new file mode 100644
index 0000000000000..4765ca0d540b3
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_version_45.verify.cpp
@@ -0,0 +1,21 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// OpenMP target offloading has only been supported since version 4.5. This test
+// verifies that one can include algorithm without any diagnostics when using
+// the minimum required version of OpenMP.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// ADDITIONAL_COMPILE_FLAGS: -fopenmp -fopenmp-version=45
+
+// REQUIRES: libcpp-pstl-backend-openmp
+
+#include <algorithm>
+
+// expected-no-diagnostics
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_version_51.verify.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_version_51.verify.cpp
new file mode 100644
index 0000000000000..b7836cb942548
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_version_51.verify.cpp
@@ -0,0 +1,21 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// OpenMP target offloading has only been supported since version 4.5. This test
+// verifies that one can include algorithm without any diagnostics when using a
+// version that is newer than the minimum requirement.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// ADDITIONAL_COMPILE_FLAGS: -fopenmp -fopenmp-version=51
+
+// REQUIRES: libcpp-pstl-backend-openmp
+
+#include <algorithm>
+
+// expected-no-diagnostics
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_offload.pass.cpp
new file mode 100644
index 0000000000000..f660bc23bbbf7
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_offload.pass.cpp
@@ -0,0 +1,55 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This test will fail if the number of devices detected by OpenMP is larger
+// than zero but std::transform(std::execution::par_unseq,...) is not executed
+// on the device.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// REQUIRES: libcpp-pstl-backend-openmp
+
+#include <algorithm>
+#include <cassert>
+#include <execution>
+#include <vector>
+#include <omp.h>
+
+int main(int, char**) {
+  // We only run the test if a device is detected by OpenMP
+  if (omp_get_num_devices() < 1)
+    return 0;
+
+  // Initializing test arrays
+  const int test_size = 10000;
+  std::vector<int> host(test_size);
+  std::vector<int> device(test_size);
+  // Should execute on host
+  std::transform(std::execution::unseq, host.begin(), host.end(), host.begin(), [](int& h) {
+    // Returns true if executed on the host
+    h = omp_is_initial_device();
+    return h;
+  });
+
+  // Asserting the std::transform(std::execution::unseq,...) executed on the host
+  for (int hi : host)
+    assert(hi && "omp_is_initial_device() returned false. std::transform was offloaded but shouldn't be.");
+
+  // Should execute on device
+  std::transform(
+      std::execution::par_unseq, device.begin(), device.end(), host.begin(), device.begin(), [](int& d, int& h) {
+        // Should return fals
+        d = omp_is_initial_device();
+        return h == d;
+      });
+
+  // Asserting the std::transform(std::execution::par_unseq,...) executed on the device
+  for (int di : device)
+    assert(!di && "omp_is_initial_device() returned true in the target region. std::transform was not offloaded.");
+  return 0;
+}
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_reduce_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_reduce_offload.pass.cpp
new file mode 100644
index 0000000000000..bf059fadf7063
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_reduce_offload.pass.cpp
@@ -0,0 +1,41 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This test will fail if the number of devices detected by OpenMP is larger
+// than zero but std::transform_reduce(std::execution::par_unseq,...) is not
+// executed on the device.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// REQUIRES: libcpp-pstl-backend-openmp
+
+#include <algorithm>
+#include <cassert>
+#include <execution>
+#include <functional>
+#include <vector>
+#include <omp.h>
+
+int main(int, char**) {
+  // We only run the test if a device is detected by OpenMP
+  if (omp_get_num_devices() < 1)
+    return 0;
+
+  // Initializing test array
+  const int test_size = 10000;
+  std::vector<int> v(test_size, 1);
+  std::vector<int> w(test_size, 1);
+
+  int result = std::transform_reduce(
+      std::execution::par_unseq, v.begin(), v.end(), w.begin(), (int)0, std::plus{}, [](int& n, int& m) {
+        return n + m + omp_is_initial_device(); // Gives 2 if executed on device, 3 if executed on host
+      });
+  assert(result == 2 * test_size &&
+         "omp_is_initial_device() returned true in the target region. std::transform_reduce was not offloaded.");
+  return 0;
+}
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_reduce_supported_binary_operations.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_reduce_supported_binary_operations.pass.cpp
new file mode 100644
index 0000000000000..fb324d6db69db
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_reduce_supported_binary_operations.pass.cpp
@@ -0,0 +1,199 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This test verifies that std::transform_reduce(std::execution::par_unseq,...)
+// can be offloaded for a number of supported binary operations. The following
+// binary operations should be supported for the reducer:
+// - std::plus
+// - std::minus
+// - std::multiplies
+// - std::logical_and
+// - std::logical_or
+// - std::bit_and
+// - std::bit_or
+// - std::bit_xor
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// REQUIRES: libcpp-pstl-backend-openmp
+
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <execution>
+#include <functional>
+#include <vector>
+#include <omp.h>
+#include <iostream>
+
+int main(int, char**) {
+  // We only run the test if a device is detected by OpenMP
+  if (omp_get_num_devices() < 1)
+    return 0;
+
+  // Initializing test array
+  const int test_size = 10000;
+
+  //===--------------------------------------------------------------------===//
+  // Arithmetic binary operators
+  //===--------------------------------------------------------------------===//
+
+  // Addition with doubles
+  {
+    std::vector<double> v(test_size, 1.0);
+    std::vector<double> w(test_size, 2.0);
+    double result = std::transform_reduce(
+        std::execution::par_unseq, v.begin(), v.end(), w.begin(), 5.0, std::plus{}, [](double& a, double& b) {
+          return 0.5 * (b - a) * ((double)!omp_is_initial_device());
+        });
+    assert((std::abs(result - 0.5 * ((double)test_size) - 5.0) < 1e-8) &&
+           "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for the binary "
+           "operation std::plus.");
+  }
+
+  // Subtraction of floats
+  {
+    std::vector<float> v(test_size, 1.0f);
+    std::vector<float> w(test_size, 1.5f);
+    float result = std::transform_reduce(
+        std::execution::par_unseq,
+        v.begin(),
+        v.end(),
+        w.begin(),
+        1.25 * ((float)test_size),
+        std::minus{},
+        [](float& a, float& b) { return 0.5 * (a + b) * ((float)!omp_is_initial_device()); });
+    assert((std::abs(result) < 1e-8f) &&
+           "std::transform_reduce(std::execution::par_unseq,...) does not have the "
+           "intended effect for the binary operation std::minus.");
+  }
+
+  // Multiplication of doubles
+  {
+    std::vector<double> v(test_size, 1.0);
+    std::vector<double> w(test_size, 0.0001);
+    double result = std::transform_reduce(
+        std::execution::par_unseq, v.begin(), v.end(), w.begin(), -1.0, std::multiplies{}, [](double& a, double& b) {
+          return (a + b) * ((double)!omp_is_initial_device());
+        });
+    assert((std::abs(result + pow(1.0001, test_size)) < 1e-8) &&
+           "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for the binary "
+           "operation std::multiplies.");
+  }
+
+  //===--------------------------------------------------------------------===//
+  // Logical binary operators
+  //===--------------------------------------------------------------------===//
+
+  // Logical and
+  {
+    std::vector<int> v(test_size, 1);
+    // The result should be true with an initial value of 1
+    int result =
+        std::transform_reduce(std::execution::par_unseq, v.begin(), v.end(), 1, std::logical_and{}, [](int& a) {
+          return a && !omp_is_initial_device();
+        });
+    assert(result &&
+           "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for the binary "
+           "operation std::logical_and.");
+
+    // And false by an initial value of 0
+    result = std::transform_reduce(std::execution::par_unseq, v.begin(), v.end(), 0, std::logical_and{}, [](int& a) {
+      return a && !omp_is_initial_device();
+    });
+    assert(!result &&
+           "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for the binary "
+           "operation std::logical_and.");
+  }
+
+  // Logical or
+  {
+    std::vector<int> v(test_size, 0);
+    // The result should be true with an initial value of 1
+    int result = std::transform_reduce(std::execution::par_unseq, v.begin(), v.end(), 1, std::logical_or{}, [](int& a) {
+      return a && !omp_is_initial_device();
+    });
+    assert(result &&
+           "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for the binary "
+           "operation std::logical_or.");
+
+    // And false by an initial value of 0
+    result = std::transform_reduce(std::execution::par_unseq, v.begin(), v.end(), 0, std::logical_or{}, [](int& a) {
+      return a && !omp_is_initial_device();
+    });
+    assert(!result && "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for the "
+                      "binary operation std::logical_or.");
+  }
+
+  //===--------------------------------------------------------------------===//
+  // Bitwise binary operators
+  //===--------------------------------------------------------------------===//
+
+  // Bitwise and
+  {
+    std::vector<unsigned int> v(test_size, 3);
+    std::vector<unsigned int> w(test_size, 2);
+    // For odd numbers the result should be true
+    int result =
+        std::transform_reduce(std::execution::par_unseq, v.begin(), v.end(), 0x1, std::bit_and{}, [](unsigned int& a) {
+          return a + omp_is_initial_device();
+        });
+    assert(result && "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for the "
+                     "binary operation std::bit_and.");
+
+    // For even numbers the result should be false
+    result =
+        std::transform_reduce(std::execution::par_unseq, w.begin(), w.end(), 0x1, std::bit_and{}, [](unsigned int& a) {
+          return a + omp_is_initial_device();
+        });
+    assert(!result && "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for the "
+                      "binary operation std::bit_and.");
+  }
+
+  // Bitwise or
+  {
+    std::vector<unsigned int> v(test_size, 0);
+    int result = std::transform_reduce(
+        std::execution::par_unseq, v.begin(), v.end(), 0, std::bit_or{}, [](unsigned int& a) -> unsigned int {
+          return a || omp_is_initial_device();
+        });
+    assert(!result && "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for the "
+                      "binary operation std::bit_or.");
+
+    // After adding a one, the result should be true
+    v[v.size() / 2] = 1;
+    result          = std::transform_reduce(
+        std::execution::par_unseq, v.begin(), v.end(), 0, std::bit_or{}, [](unsigned int& a) -> unsigned int {
+          return a && !omp_is_initial_device();
+        });
+    assert(result && "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for the "
+                     "binary operation std::bit_or.");
+  }
+
+  // Bitwise xor
+  {
+    std::vector<unsigned int> v(test_size, 0xef);
+    int result =
+        std::transform_reduce(std::execution::par_unseq, v.begin(), v.end(), 0, std::bit_xor{}, [](unsigned int& a) {
+          return a << omp_is_initial_device();
+        });
+    assert(result == 0 && "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for "
+                          "the binary operation std::bit_or.");
+
+    // After adding a one, the result should be true
+    v[v.size() / 2] = 0xea;
+    result =
+        std::transform_reduce(std::execution::par_unseq, v.begin(), v.end(), 0, std::bit_xor{}, [](unsigned int& a) {
+          return a << omp_is_initial_device();
+        });
+    assert(result == 5 && "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for "
+                          "the binary operation std::bit_or.");
+  }
+
+  return 0;
+}
diff --git a/libcxx/utils/ci/run-buildbot b/libcxx/utils/ci/run-buildbot
index e40c2b635ef90..842ff584ddc15 100755
--- a/libcxx/utils/ci/run-buildbot
+++ b/libcxx/utils/ci/run-buildbot
@@ -130,6 +130,26 @@ function generate-cmake() {
           "${@}"
 }
 
+function generate-cmake-openmp() {
+    echo "--- Generating CMake"
+    ${CMAKE} \
+          -S "${MONOREPO_ROOT}/llvm" \
+          -B "${BUILD_DIR}" \
+          -GNinja -DCMAKE_MAKE_PROGRAM="${NINJA}" \
+          -DCMAKE_BUILD_TYPE=RelWithDebInfo \
+          -DCMAKE_INSTALL_PREFIX="${INSTALL_DIR}" \
+          -DLIBCXX_ENABLE_WERROR=YES \
+          -DLIBCXXABI_ENABLE_WERROR=YES \
+          -DLIBUNWIND_ENABLE_WERROR=YES \
+          -DLIBCXX_ENABLE_CLANG_TIDY=${ENABLE_CLANG_TIDY} \
+          -DLLVM_LIT_ARGS="-sv --xunit-xml-output test-results.xml --timeout=1500 --time-tests" \
+          -DLLVM_ENABLE_PROJECTS="clang;openmp" \
+          -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi;libunwind;offload" \
+          -DLIBCXX_PSTL_BACKEND="openmp" \
+          -DLIBCXX_CXX_ABI=libcxxabi \
+          "${@}"
+}
+
 function generate-cmake-libcxx-win() {
     generate-cmake-base \
           -DLLVM_ENABLE_RUNTIMES="libcxx" \
@@ -514,6 +534,18 @@ generic-optimized-speed)
     generate-cmake -C "${MONOREPO_ROOT}/libcxx/cmake/caches/Generic-optimized-speed.cmake"
     check-runtimes
 ;;
+generic-pstl-openmp)
+    clean
+    # TODO: Pass different host and device triples to the test configuration.
+    # For now, the OpenMP buildbot offloads to the host. To set the correct host target triple,
+    # we pass it via the LIBCXX test parameters.
+    PARAMS="target_triple=$(${CXX} --print-target-triple)"
+    generate-cmake-openmp -C "${MONOREPO_ROOT}/libcxx/cmake/caches/Generic-pstl-openmp.cmake" \
+                          ${MONOREPO_ROOT}/llvm -DLIBCXX_TEST_PARAMS="${PARAMS}"
+    echo "+++ Installing OpenMP and Clang"
+    ${NINJA} -vC "${BUILD_DIR}" install
+    check-runtimes
+;;
 apple-system)
     clean
 
diff --git a/libcxx/utils/libcxx/test/features.py b/libcxx/utils/libcxx/test/features.py
index 7a9631a56e4bb..f516da9c552bc 100644
--- a/libcxx/utils/libcxx/test/features.py
+++ b/libcxx/utils/libcxx/test/features.py
@@ -359,6 +359,7 @@ def _mingwSupportsModules(cfg):
     "_LIBCPP_HAS_NO_TIME_ZONE_DATABASE": "no-tzdb",
     "_LIBCPP_HAS_NO_UNICODE": "libcpp-has-no-unicode",
     "_LIBCPP_PSTL_BACKEND_LIBDISPATCH": "libcpp-pstl-backend-libdispatch",
+    "_LIBCPP_PSTL_BACKEND_OPENMP": "openmp_pstl_backend",
 }
 for macro, feature in macros.items():
     DEFAULT_FEATURES.append(
@@ -368,6 +369,27 @@ def _mingwSupportsModules(cfg):
         )
     )
 
+DEFAULT_FEATURES.append(
+    Feature(
+        name="libcpp-pstl-backend-openmp",
+        when=lambda cfg: "_LIBCPP_PSTL_BACKEND_OPENMP" in compilerMacros(cfg),
+        actions=[
+            AddFlagIfSupported("-fopenmp"),
+            # The linker needs to find the correct version of libomptarget
+            AddLinkFlag("-Wl,-rpath,%{lib-dir}"),
+            # The preprocessor needs to find the omp.h header. If OpenMP was
+            # installed as a project, the header lives in the following
+            # directory
+            AddFlag("-I %{lib-dir}/../../projects/openmp/runtime/src/"),
+            # And if it was installed as a runtime it lives in the following:
+            AddFlag("-I %{lib-dir}/../../runtimes/runtimes-bins/openmp/runtime/src"),
+            # TODO: Add conditional to test if a GPU target has been detected.
+            # For now, we only offload to the host in this test configuration.
+            # If a GPU were present, we should instead pass --offload-arch=native.
+            AddFlagIfSupported("-fopenmp-targets=%{triple}"),
+        ],
+    )
+)
 
 # Mapping from canonical locale names (used in the tests) to possible locale
 # names on various systems. Each locale is considered supported if any of the
diff --git a/libcxx/utils/run.py b/libcxx/utils/run.py
index 6b4d615444bcf..427e96b91fa37 100755
--- a/libcxx/utils/run.py
+++ b/libcxx/utils/run.py
@@ -64,6 +64,21 @@ def main():
         if "TEMP" in os.environ:
             env["TEMP"] = os.environ.get("TEMP")
 
+    # Forwarding the environment variable CUDA_VISIBLE_DEVICES which configures
+    # the visible NVIDIA GPUs.
+    if "CUDA_VISIBLE_DEVICES" in os.environ:
+        env["CUDA_VISIBLE_DEVICES"] = os.environ["CUDA_VISIBLE_DEVICES"]
+
+    # Forwarding the environment variable ROCR_VISIBLE_DEVICES which configures
+    # the visible AMD GPUs.
+    if "ROCR_VISIBLE_DEVICES" in os.environ:
+        env["ROCR_VISIBLE_DEVICES"] = os.environ["ROCR_VISIBLE_DEVICES"]
+
+    # Pass the OpenMP debug flag. Can be used to print information about the
+    # GPU execution of the tests.
+    if "LIBOMPTARGET_DEBUG" in os.environ:
+        env["LIBOMPTARGET_DEBUG"] = os.environ["LIBOMPTARGET_DEBUG"]
+
     # Run the command line with the given environment in the execution directory.
     return subprocess.call(commandLine, cwd=args.execdir, env=env, shell=False)
 
diff --git a/libcxxabi/CMakeLists.txt b/libcxxabi/CMakeLists.txt
index 52ba52f3439fb..01fa382ab93f0 100644
--- a/libcxxabi/CMakeLists.txt
+++ b/libcxxabi/CMakeLists.txt
@@ -414,6 +414,14 @@ if (UNIX AND ${CMAKE_SYSTEM_NAME} MATCHES "AIX")
   add_definitions("-D_XOPEN_SOURCE=700")
 endif()
 
+# If the OpenMP PSTL backend has been enabled for libcxx, OpenMP must be
+# enabled during compilation
+if (DEFINED LIBCXX_PSTL_BACKEND)
+  if (LIBCXX_PSTL_BACKEND STREQUAL "openmp")
+    add_compile_options(-fopenmp)
+  endif()
+endif()
+
 #===============================================================================
 # Setup Source Code
 #===============================================================================