[libcxx-commits] [libcxx] [libcxxabi] [llvm] Adding Separate OpenMP Offloading Backend to `libcxx/include/__algorithm/pstl_backends` (PR #66968)
Vedant Tewari via libcxx-commits
libcxx-commits at lists.llvm.org
Wed Aug 14 18:21:16 PDT 2024
https://github.com/xevor11 updated https://github.com/llvm/llvm-project/pull/66968
>From 10d408f5335e7e61065118f86bf1bcdc1287aefa Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Wed, 20 Sep 2023 17:06:10 -0700
Subject: [PATCH 1/2] Adding OpenMP Offloading Backend for C++ Parallel
Algorithms
---
.github/workflows/libcxx-build-and-test.yaml | 1 +
libcxx/CMakeLists.txt | 12 +-
libcxx/cmake/caches/Generic-pstl-openmp.cmake | 1 +
libcxx/docs/BuildingLibcxx.rst | 11 +
libcxx/docs/UsingLibcxx.rst | 104 ++++
libcxx/include/CMakeLists.txt | 1 +
libcxx/include/__config_site.in | 1 +
libcxx/include/__pstl/backend.h | 4 +
libcxx/include/__pstl/backend_fwd.h | 4 +
libcxx/include/__pstl/backends/openmp.h | 511 ++++++++++++++++++
libcxx/include/__pstl/dispatch.h | 15 +
libcxx/include/module.modulemap | 4 +
.../alg.pstl.openmp/fill_offload.pass.cpp | 52 ++
.../alg.pstl.openmp/find_if.pass.cpp | 67 +++
.../alg.pstl.openmp/find_if_funptr.pass.cpp | 36 ++
.../alg.pstl.openmp/find_if_offload.pass.cpp | 39 ++
.../alg.pstl.openmp/for_each_funptr.pass.cpp | 36 ++
.../alg.pstl.openmp/for_each_lambda.pass.cpp | 49 ++
.../alg.pstl.openmp/for_each_offload.pass.cpp | 39 ++
.../for_each_overwrite_input.pass.cpp | 63 +++
.../gpu_environment_variables.pass.cpp | 49 ++
.../openmp_version_40.verify.cpp | 21 +
.../openmp_version_45.verify.cpp | 21 +
.../openmp_version_51.verify.cpp | 21 +
.../transform_offload.pass.cpp | 55 ++
.../transform_reduce_offload.pass.cpp | 41 ++
...educe_supported_binary_operations.pass.cpp | 199 +++++++
libcxx/utils/ci/run-buildbot | 32 ++
libcxx/utils/libcxx/test/features.py | 22 +
libcxx/utils/run.py | 15 +
libcxxabi/CMakeLists.txt | 8 +
31 files changed, 1532 insertions(+), 2 deletions(-)
create mode 100644 libcxx/cmake/caches/Generic-pstl-openmp.cmake
create mode 100644 libcxx/include/__pstl/backends/openmp.h
create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.openmp/fill_offload.pass.cpp
create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if.pass.cpp
create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_funptr.pass.cpp
create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_offload.pass.cpp
create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_funptr.pass.cpp
create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_lambda.pass.cpp
create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_offload.pass.cpp
create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_overwrite_input.pass.cpp
create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.openmp/gpu_environment_variables.pass.cpp
create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_version_40.verify.cpp
create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_version_45.verify.cpp
create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_version_51.verify.cpp
create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_offload.pass.cpp
create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_reduce_offload.pass.cpp
create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_reduce_supported_binary_operations.pass.cpp
diff --git a/.github/workflows/libcxx-build-and-test.yaml b/.github/workflows/libcxx-build-and-test.yaml
index d7c21394ca486f..2f25e3b0c47dc0 100644
--- a/.github/workflows/libcxx-build-and-test.yaml
+++ b/.github/workflows/libcxx-build-and-test.yaml
@@ -153,6 +153,7 @@ jobs:
'generic-no-wide-characters',
'generic-no-rtti',
'generic-optimized-speed',
+ 'generic-pstl-openmp',
'generic-static',
# TODO Find a better place for the benchmark and bootstrapping builds to live. They're either very expensive
# or don't provide much value since the benchmark run results are too noise on the bots.
diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt
index 4b927017f8c2ac..c1dec6bf54ab8e 100644
--- a/libcxx/CMakeLists.txt
+++ b/libcxx/CMakeLists.txt
@@ -301,10 +301,11 @@ option(LIBCXX_HAS_EXTERNAL_THREAD_API
This option may only be set to ON when LIBCXX_ENABLE_THREADS=ON." OFF)
if (LIBCXX_ENABLE_THREADS)
- set(LIBCXX_PSTL_BACKEND "std_thread" CACHE STRING "Which PSTL backend to use")
+ set(LIBCXX_PSTL_BACKEND_DEFAULT "std_thread")
else()
- set(LIBCXX_PSTL_BACKEND "serial" CACHE STRING "Which PSTL backend to use")
+ set(LIBCXX_PSTL_BACKEND_DEFAULT "serial")
endif()
+set(LIBCXX_PSTL_BACKEND "${LIBCXX_PSTL_BACKEND_DEFAULT}" CACHE STRING "Select the PSTL backend to use. Valid values are serial, std-thread, libdispatch, openmp. Default: ${LIBCXX_PSTL_BACKEND_DEFAULT}")
# Misc options ----------------------------------------------------------------
# FIXME: Turn -pedantic back ON. It is currently off because it warns
@@ -571,6 +572,11 @@ function(cxx_add_basic_build_flags target)
endif()
endif()
target_compile_options(${target} PUBLIC "${LIBCXX_ADDITIONAL_COMPILE_FLAGS}")
+
+ # If the PSTL backend depends on OpenMP, we must enable the OpenMP tool chain
+ if (LIBCXX_PSTL_BACKEND STREQUAL "openmp")
+ target_add_compile_flags_if_supported(${target} PUBLIC -fopenmp)
+ endif()
endfunction()
# Exception flags =============================================================
@@ -800,6 +806,8 @@ elseif(LIBCXX_PSTL_BACKEND STREQUAL "std_thread")
config_define(1 _LIBCPP_PSTL_BACKEND_STD_THREAD)
elseif(LIBCXX_PSTL_BACKEND STREQUAL "libdispatch")
config_define(1 _LIBCPP_PSTL_BACKEND_LIBDISPATCH)
+elseif (LIBCXX_PSTL_BACKEND STREQUAL "openmp")
+ config_define(1 _LIBCPP_PSTL_BACKEND_OPENMP)
else()
message(FATAL_ERROR "LIBCXX_PSTL_BACKEND is set to ${LIBCXX_PSTL_BACKEND}, which is not a valid backend.
Valid backends are: serial, std_thread and libdispatch")
diff --git a/libcxx/cmake/caches/Generic-pstl-openmp.cmake b/libcxx/cmake/caches/Generic-pstl-openmp.cmake
new file mode 100644
index 00000000000000..f3ff4f3b57fd21
--- /dev/null
+++ b/libcxx/cmake/caches/Generic-pstl-openmp.cmake
@@ -0,0 +1 @@
+set(LIBCXX_PSTL_BACKEND openmp CACHE STRING "")
diff --git a/libcxx/docs/BuildingLibcxx.rst b/libcxx/docs/BuildingLibcxx.rst
index e425b9dadfe7d1..5727005e24fbde 100644
--- a/libcxx/docs/BuildingLibcxx.rst
+++ b/libcxx/docs/BuildingLibcxx.rst
@@ -424,6 +424,17 @@ libc++ Feature Options
provided, this header will be included by the library, replacing the
default assertion handler.
+.. option:: LIBCXX_PSTL_BACKEND:STRING
+
+ **Default**:: ``"serial"``
+
+ **Values**:: ``serial``, ``std-thread``, ``libdispatch``, ``openmp``
+
+ Select the desired backend for C++ parallel algorithms. All four options can
+ target multi-core CPU architectures, and ``openmp`` can additionally target
+ GPU architectures. The ``openmp`` backend requires OpenMP version 4.5 or
+ later.
+
libc++ ABI Feature Options
--------------------------
diff --git a/libcxx/docs/UsingLibcxx.rst b/libcxx/docs/UsingLibcxx.rst
index df08875c13beae..f1e7b19ead5798 100644
--- a/libcxx/docs/UsingLibcxx.rst
+++ b/libcxx/docs/UsingLibcxx.rst
@@ -364,6 +364,110 @@ Unpoisoning may not be an option, if (for example) you are not maintaining the a
* You are using allocator, which does not call destructor during deallocation.
* You are aware that memory allocated with an allocator may be accessed, even when unused by container.
+Offloading C++ Parallel Algorithms to GPUs
+------------------------------------------
+
+Experimental support for GPU offloading has been added to ``libc++``. The
+implementation uses OpenMP target offloading to leverage GPU compute resources.
+The OpenMP PSTL backend can target both NVIDIA and AMD GPUs.
+However, the implementation only supports contiguous iterators, such as
+iterators for ``std::vector`` or ``std::array``.
+To enable the OpenMP offloading backend it must be selected with
+``LIBCXX_PSTL_BACKEND=openmp`` when installing ``libc++``. Further, when
+compiling a program, the user must specify the command line options
+``-fopenmp -fexperimental-library``. To install LLVM with OpenMP offloading
+enabled, please read
+`the LLVM OpenMP FAQ. <https://openmp.llvm.org/SupportAndFAQ.html>`_
+You may also want to to visit
+`the OpenMP offloading command-line argument reference. <https://openmp.llvm.org/CommandLineArgumentReference.html#offload-command-line-arguments>`_
+
+Example
+~~~~~~~
+
+The following is an example of offloading vector addition to a GPU using our
+standard library extension. It implements the classical vector addition from
+BLAS that overwrites the vector ``y`` with ``y=a*x+y``. Thus ``y.begin()`` is
+both used as an input and an output iterator in this example.
+
+.. code-block:: cpp
+
+ #include <algorithm>
+ #include <execution>
+
+ template <typename T1, typename T2, typename T3>
+ void axpy(const T1 a, const std::vector<T2> &x, std::vector<T3> &y) {
+ std::transform(std::execution::par_unseq, x.begin(), x.end(), y.begin(),
+ y.begin(), [=](T2 xi, T3 yi) { return a * xi + yi; });
+ }
+
+The execution policy ``std::execution::par_unseq`` states that the algorithm's
+execution may be parallelized, vectorized, and migrated across threads. This is
+the only execution mode that is safe to offload to GPUs, and for all other
+execution modes the algorithms will execute on the CPU.
+Special attention must be paid to the lambda captures when enabling GPU
+offloading. If the lambda captures by reference, the user must manually map the
+variables to the device. If capturing by reference, the above example could
+be implemented in the following way.
+
+.. code-block:: cpp
+
+ template <typename T1, typename T2, typename T3>
+ void axpy(const T1 a, const std::vector<T2> &x, std::vector<T3> &y) {
+ #pragma omp target data map(to : a)
+ std::transform(std::execution::par_unseq, x.begin(), x.end(), y.begin(),
+ y.begin(), [&](T2 xi, T3 yi) { return a * xi + yi; });
+ }
+
+However, if unified shared memory, USM, is enabled, no additional data mapping
+is necessary when capturing y reference.
+
+Compiling functions for GPUs with OpenMP
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The C++ standard defines that all accesses to memory are inside a single address
+space. However, discrete GPU systems have distinct address spaces. A single
+address space can be emulated if your system supports unified shared memory.
+However, many discrete GPU systems do not, and in those cases it is important to
+pass device function pointers to the parallel algorithms. Below is an example of
+how the OpenMP ``declare target`` directive with the ``indirect`` clause can be
+used to mark that a function should be compiled for both host and device.
+
+.. code-block:: cpp
+
+ // This function computes the squared difference of two floating points
+ float squared(float a, float b) { return a * a - 2.0f * a * b + b * b; };
+
+ // Declare that the function must be compiled for both host and device
+ #pragma omp declare target indirect to(squared)
+
+ int main() {
+ std::vector<float> a(100, 1.0);
+ std::vector<float> b(100, 1.25);
+
+ // Pass the host function pointer to the parallel algorithm and let OpenMP
+ // translate it to the device function pointer internally
+ float sum =
+ std::transform_reduce(std::execution::par_unseq, a.begin(), a.end(),
+ b.begin(), 0.0f, std::plus{}, squared);
+
+ // Validate that the result is approximately 6.25
+ assert(std::abs(sum - 6.25f) < 1e-10);
+ return 0;
+ }
+
+Without unified shared memory, the above example will not work if the host
+function pointer ``squared`` is passed to the parallel algorithm.
+
+Important notes about exception handling
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+GPU architectures do not support exception handling. If compiling a program
+containing parallel algorithms with current versions of Clang, a program with
+exceptions in offloaded code regions will compile, but the program will
+terminate if an exception is thrown on the device. This does not conform with
+the C++ standard and exception handling on GPUs will hopefully be better
+supported in future releases of LLVM.
+
Platform specific behavior
==========================
diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index 8d0ffd6ed725bd..f641abc38bb816 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -579,6 +579,7 @@ set(files
__pstl/backend_fwd.h
__pstl/backends/default.h
__pstl/backends/libdispatch.h
+ __pstl/backends/openmp.h
__pstl/backends/serial.h
__pstl/backends/std_thread.h
__pstl/cpu_algos/any_of.h
diff --git a/libcxx/include/__config_site.in b/libcxx/include/__config_site.in
index 89a14609ee3f92..00693aeb919cb4 100644
--- a/libcxx/include/__config_site.in
+++ b/libcxx/include/__config_site.in
@@ -35,6 +35,7 @@
#cmakedefine _LIBCPP_PSTL_BACKEND_SERIAL
#cmakedefine _LIBCPP_PSTL_BACKEND_STD_THREAD
#cmakedefine _LIBCPP_PSTL_BACKEND_LIBDISPATCH
+#cmakedefine _LIBCPP_PSTL_BACKEND_OPENMP
// Hardening.
#cmakedefine _LIBCPP_HARDENING_MODE_DEFAULT @_LIBCPP_HARDENING_MODE_DEFAULT@
diff --git a/libcxx/include/__pstl/backend.h b/libcxx/include/__pstl/backend.h
index 86d9f28c77fa8c..cb47501c19fc88 100644
--- a/libcxx/include/__pstl/backend.h
+++ b/libcxx/include/__pstl/backend.h
@@ -28,6 +28,10 @@ _LIBCPP_PUSH_MACROS
#elif defined(_LIBCPP_PSTL_BACKEND_LIBDISPATCH)
# include <__pstl/backends/default.h>
# include <__pstl/backends/libdispatch.h>
+#elif defined(_LIBCPP_PSTL_BACKEND_OPENMP)
+# include <__pstl/backends/default.h>
+# include <__pstl/backends/openmp.h>
+# include <__pstl/backends/std_thread.h>
#endif
_LIBCPP_POP_MACROS
diff --git a/libcxx/include/__pstl/backend_fwd.h b/libcxx/include/__pstl/backend_fwd.h
index 32c5da576fb3c0..ed08d45206a8b8 100644
--- a/libcxx/include/__pstl/backend_fwd.h
+++ b/libcxx/include/__pstl/backend_fwd.h
@@ -47,6 +47,7 @@ struct __backend_configuration;
struct __default_backend_tag;
struct __libdispatch_backend_tag;
+struct __openmp_backend_tag;
struct __serial_backend_tag;
struct __std_thread_backend_tag;
@@ -56,6 +57,9 @@ using __current_configuration = __backend_configuration<__serial_backend_tag, __
using __current_configuration = __backend_configuration<__std_thread_backend_tag, __default_backend_tag>;
#elif defined(_LIBCPP_PSTL_BACKEND_LIBDISPATCH)
using __current_configuration = __backend_configuration<__libdispatch_backend_tag, __default_backend_tag>;
+#elif defined(_LIBCPP_PSTL_BACKEND_OPENMP)
+using __current_configuration =
+ __backend_configuration<__openmp_backend_tag, __std_thread_backend_tag, __default_backend_tag>;
#else
// ...New vendors can add parallel backends here...
diff --git a/libcxx/include/__pstl/backends/openmp.h b/libcxx/include/__pstl/backends/openmp.h
new file mode 100644
index 00000000000000..158be91fb9ca99
--- /dev/null
+++ b/libcxx/include/__pstl/backends/openmp.h
@@ -0,0 +1,511 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___PSTL_BACKENDS_OPENMP_H
+#define _LIBCPP___PSTL_BACKENDS_OPENMP_H
+
+// Combined OpenMP CPU and GPU Backend
+// ===================================
+// Contrary to the CPU backends found in ./cpu_backends/, the OpenMP backend can
+// target both CPUs and GPUs. The OpenMP standard defines that when offloading
+// code to an accelerator, the compiler must generate a fallback code for
+// execution on the host. Thereby, the backend works as a CPU backend if no
+// targeted accelerator is available at execution time. The target regions can
+// also be compiled directly for a CPU architecture, for instance by adding the
+// command-line option `-fopenmp-targets=x86_64-pc-linux-gnu` in Clang.
+//
+// When is an Algorithm Offloaded?
+// -------------------------------
+// Only parallel algorithms with the parallel unsequenced execution policy are
+// offloaded to the device. We cannot offload parallel algorithms with a
+// parallel execution policy to GPUs because invocations executing in the same
+// thread "are indeterminately sequenced with respect to each other" which we
+// cannot guarantee on a GPU.
+//
+// The standard draft states that "the semantics [...] allow the implementation
+// to fall back to sequential execution if the system cannot parallelize an
+// algorithm invocation". If it is not deemed safe to offload the parallel
+// algorithm to the device, we first fall back to a parallel unsequenced
+// implementation from ./cpu_backends. The CPU implementation may then fall back
+// to sequential execution. In that way we strive to achieve the best possible
+// performance.
+//
+// Further, "it is the caller's responsibility to ensure that the invocation
+// does not introduce data races or deadlocks."
+//
+// Implicit Assumptions
+// --------------------
+// If the user provides a function pointer as an argument to a parallel
+// algorithm, it is assumed that it is the device pointer as there is currently
+// no way to check whether a host or device pointer was passed.
+//
+// Mapping Clauses
+// ---------------
+// In some of the parallel algorithms, the user is allowed to provide the same
+// iterator as input and output. The order of the maps matters because OpenMP
+// keeps a reference counter of which variables have been mapped to the device.
+// Thereby, a varible is only copied to the device if its reference counter is
+// incremented from zero, and it is only copied back to the host when the
+// reference counter is decremented to zero again.
+// This allows nesting mapped regions, for instance in recursive functions,
+// without enforcing a lot of unnecessary data movement.
+// Therefore, `pragma omp target data map(to:...)` must be used before
+// `pragma omp target data map(alloc:...)`. Conversely, the maps with map
+// modifier `release` must be placed before the maps with map modifier `from`
+// when transferring the result from the device to the host.
+//
+// Example: Assume `a` and `b` are pointers to the same array.
+// ``` C++
+// #pragma omp target enter data map(alloc:a[0:n])
+// // The reference counter is incremented from 0 to 1. a is not copied to the
+// // device because of the `alloc` map modifier.
+// #pragma omp target enter data map(to:b[0:n])
+// // The reference counter is incremented from 1 to 2. b is not copied because
+// // the reference counter is positive. Therefore b, and a, are uninitialized
+// // on the device.
+// ```
+//
+// Exceptions
+// ----------
+// Currently, GPU architectures do not handle exceptions. OpenMP target regions
+// are allowed to contain try/catch statements and throw expressions in Clang,
+// but if a throw expression is reached, it will terminate the program. That
+// does not conform to the C++ standard.
+//
+// [This document](https://eel.is/c++draft/algorithms.parallel) has been used as
+// reference for these considerations.
+
+#include <__algorithm/unwrap_iter.h>
+#include <__config>
+#include <__functional/operations.h>
+#include <__iterator/iterator_traits.h>
+#include <__iterator/wrap_iter.h>
+#include <__pstl/backend_fwd.h>
+#include <__pstl/dispatch.h>
+#include <__type_traits/desugars_to.h>
+#include <__type_traits/is_arithmetic.h>
+#include <__type_traits/is_trivially_copyable.h>
+#include <__type_traits/remove_cvref.h>
+#include <__utility/empty.h>
+#include <__utility/forward.h>
+#include <__utility/move.h>
+#include <execution>
+#include <optional>
+
+#if !defined(_OPENMP)
+# error "Trying to use the OpenMP PSTL backend, but OpenMP is not enabled. Did you compile with -fopenmp?"
+#elif (defined(_OPENMP) && _OPENMP < 201511)
+# error \
+ "OpenMP target offloading has been supported since OpenMP version 4.5 (201511). Please use a more recent version of OpenMP."
+#endif
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+namespace __pstl {
+
+// The following functions can be used to map contiguous array sections to and from the device.
+// For now, they are simple overlays of the OpenMP pragmas, but they should be updated when adding
+// support for other iterator types.
+template <class _Iterator, class _DifferenceType>
+_LIBCPP_HIDE_FROM_ABI void
+__omp_map_to([[maybe_unused]] const _Iterator __p, [[maybe_unused]] const _DifferenceType __len) noexcept {
+ static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
+#pragma omp target enter data map(to : __p[0 : __len])
+}
+
+template <class _Iterator, class _DifferenceType>
+_LIBCPP_HIDE_FROM_ABI void
+__omp_map_from([[maybe_unused]] const _Iterator __p, [[maybe_unused]] const _DifferenceType __len) noexcept {
+ static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
+#pragma omp target exit data map(from : __p[0 : __len])
+}
+
+template <class _Iterator, class _DifferenceType>
+_LIBCPP_HIDE_FROM_ABI void
+__omp_map_alloc([[maybe_unused]] const _Iterator __p, [[maybe_unused]] const _DifferenceType __len) noexcept {
+ static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
+#pragma omp target enter data map(alloc : __p[0 : __len])
+}
+
+template <class _Iterator, class _DifferenceType>
+_LIBCPP_HIDE_FROM_ABI void
+__omp_map_release([[maybe_unused]] const _Iterator __p, [[maybe_unused]] const _DifferenceType __len) noexcept {
+ static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
+#pragma omp target exit data map(release : __p[0 : __len])
+}
+
+//
+// fill
+//
+template <class _Tp, class _DifferenceType, class _Up>
+_LIBCPP_HIDE_FROM_ABI _Tp* __omp_fill(_Tp* __out1, _DifferenceType __n, const _Up& __value) noexcept {
+ __pstl::__omp_map_alloc(__out1, __n);
+#pragma omp target teams distribute parallel for
+ for (_DifferenceType __i = 0; __i < __n; ++__i)
+ *(__out1 + __i) = __value;
+ __pstl::__omp_map_from(__out1, __n);
+ return __out1 + __n;
+}
+
+template <>
+struct __fill<__openmp_backend_tag, execution::parallel_unsequenced_policy> {
+ template <class _Policy, class _ForwardIterator, class _Tp>
+ [[nodiscard]] _LIBCPP_HIDE_FROM_ABI optional<__empty>
+ operator()(_Policy&& __policy, _ForwardIterator __first, _ForwardIterator __last, _Tp const& __value) const noexcept {
+ using _ValueType = typename iterator_traits<_ForwardIterator>::value_type;
+ if constexpr (__libcpp_is_contiguous_iterator<_ForwardIterator>::value && is_trivially_copyable_v<_ValueType> &&
+ is_trivially_copyable_v<_Tp>) {
+ __pstl::__omp_fill(std::__unwrap_iter(__first), __last - __first, __value);
+ return __empty{};
+ } else {
+ using _Backends = __backends_after<__current_configuration, __openmp_backend_tag>;
+ using _Fallback = __dispatch<__pstl::__fill, _Backends, __remove_cvref_t<_Policy>>;
+ return _Fallback{}(std::forward<_Policy>(__policy), std::move(__first), std::move(__last), __value);
+ }
+ }
+};
+
+//
+// find_if
+//
+template <class _Tp, class _DifferenceType, class _Predicate>
+_LIBCPP_HIDE_FROM_ABI _Tp* __omp_find_if(_Tp* __first, _DifferenceType __n, _Predicate __pred) noexcept {
+ __pstl::__omp_map_to(__first, __n);
+ _DifferenceType __idx = __n;
+#pragma omp target teams distribute parallel for reduction(min : __idx)
+ for (_DifferenceType __i = 0; __i < __n; ++__i) {
+ if (__pred(*(__first + __i))) {
+ __idx = (__i < __idx) ? __i : __idx;
+ }
+ }
+ __pstl::__omp_map_release(__first, __n);
+ return __first + __idx;
+}
+
+template <>
+struct __find_if<__openmp_backend_tag, execution::parallel_unsequenced_policy> {
+ template <class _Policy, class _ForwardIterator, class _Predicate>
+ _LIBCPP_HIDE_FROM_ABI optional<_ForwardIterator>
+ operator()(_Policy&& __policy, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) const noexcept {
+ using _ValueType = typename iterator_traits<_ForwardIterator>::value_type;
+ if constexpr (__libcpp_is_contiguous_iterator<_ForwardIterator>::value && is_trivially_copyable_v<_ValueType>) {
+ return std::__rewrap_iter(__first, __pstl::__omp_find_if(std::__unwrap_iter(__first), __last - __first, __pred));
+ } else {
+ using _Backends = __backends_after<__current_configuration, __openmp_backend_tag>;
+ using _Fallback = __dispatch<__pstl::__find_if, _Backends, __remove_cvref_t<_Policy>>;
+ return _Fallback{}(std::forward<_Policy>(__policy), std::move(__first), std::move(__last), std::move(__pred));
+ }
+ }
+};
+
+//
+// for_each
+//
+template <class _Tp, class _DifferenceType, class _Function>
+_LIBCPP_HIDE_FROM_ABI _Tp* __omp_for_each(_Tp* __inout1, _DifferenceType __n, _Function __f) noexcept {
+ __pstl::__omp_map_to(__inout1, __n);
+#pragma omp target teams distribute parallel for
+ for (_DifferenceType __i = 0; __i < __n; ++__i)
+ __f(*(__inout1 + __i));
+ __pstl::__omp_map_from(__inout1, __n);
+ return __inout1 + __n;
+}
+
+template <>
+struct __for_each<__openmp_backend_tag, execution::parallel_unsequenced_policy> {
+ template <class _Policy, class _ForwardIterator, class _Functor>
+ _LIBCPP_HIDE_FROM_ABI optional<__empty>
+ operator()(_Policy&& __policy, _ForwardIterator __first, _ForwardIterator __last, _Functor __func) const noexcept {
+ using _ValueType = typename iterator_traits<_ForwardIterator>::value_type;
+ if constexpr (__libcpp_is_contiguous_iterator<_ForwardIterator>::value &&
+ __libcpp_is_contiguous_iterator<_ForwardIterator>::value && is_trivially_copyable_v<_ValueType>) {
+ __pstl::__omp_for_each(std::__unwrap_iter(__first), __last - __first, std::move(__func));
+ return __empty{};
+ } else {
+ using _Backends = __backends_after<__current_configuration, __openmp_backend_tag>;
+ using _Fallback = __dispatch<__pstl::__for_each, _Backends, __remove_cvref_t<_Policy>>;
+ return _Fallback{}(std::forward<_Policy>(__policy), std::move(__first), std::move(__last), std::move(__func));
+ }
+ }
+};
+
+//
+// transform
+//
+template <class _Tp, class _DifferenceType, class _Up, class _Function>
+_LIBCPP_HIDE_FROM_ABI _Tp* __omp_transform(_Tp* __in1, _DifferenceType __n, _Up* __out1, _Function __f) noexcept {
+ // The order of the following maps matter, as we wish to move the data. If
+ // they were placed in the reverse order, and __in equals __out, then we would
+ // allocate the buffer on the device without copying the data.
+ __pstl::__omp_map_to(__in1, __n);
+ __pstl::__omp_map_alloc(__out1, __n);
+#pragma omp target teams distribute parallel for
+ for (_DifferenceType __i = 0; __i < __n; ++__i)
+ *(__out1 + __i) = __f(*(__in1 + __i));
+ // The order of the following two maps matters, since the user could legally
+ // overwrite __in The "release" map modifier decreases the reference counter
+ // by one, and "from" only moves the data to the host, when the reference
+ // count is decremented to zero.
+ __pstl::__omp_map_release(__in1, __n);
+ __pstl::__omp_map_from(__out1, __n);
+ return __out1 + __n;
+}
+
+template <>
+struct __transform<__openmp_backend_tag, execution::parallel_unsequenced_policy> {
+ template <class _Policy, class _ForwardIterator, class _ForwardOutIterator, class _UnaryOperation>
+ _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator>
+ operator()(_Policy&& __policy,
+ _ForwardIterator __first,
+ _ForwardIterator __last,
+ _ForwardOutIterator __outit,
+ _UnaryOperation __op) const noexcept {
+ using _ValueType = typename iterator_traits<_ForwardIterator>::value_type;
+ if constexpr (__libcpp_is_contiguous_iterator<_ForwardIterator>::value &&
+ __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value && is_trivially_copyable_v<_ValueType>) {
+ return std::__rewrap_iter(
+ __outit,
+ __omp_transform(
+ std::__unwrap_iter(__first), __last - __first, std::__unwrap_iter(__outit), std::move(__op)));
+ } else {
+ using _Backends = __backends_after<__current_configuration, __openmp_backend_tag>;
+ using _Fallback = __dispatch<__pstl::__transform, _Backends, __remove_cvref_t<_Policy>>;
+ return _Fallback{}(
+ std::forward<_Policy>(__policy), std::move(__first), std::move(__last), std::move(__outit), std::move(__op));
+ }
+ }
+};
+
+//
+// transform_binary
+//
+template <class _Tp, class _DifferenceType, class _Up, class _Vp, class _Function>
+_LIBCPP_HIDE_FROM_ABI _Tp*
+__omp_transform(_Tp* __in1, _DifferenceType __n, _Up* __in2, _Vp* __out1, _Function __f) noexcept {
+ // The order of the following maps matter, as we wish to move the data. If
+ // they were placed in the reverse order, and __out equals __in1 or __in2,
+ // then we would allocate one of the buffer on the device without copying the
+ // data.
+ __pstl::__omp_map_to(__in1, __n);
+ __pstl::__omp_map_to(__in2, __n);
+ __pstl::__omp_map_alloc(__out1, __n);
+#pragma omp target teams distribute parallel for
+ for (_DifferenceType __i = 0; __i < __n; ++__i)
+ *(__out1 + __i) = __f(*(__in1 + __i), *(__in2 + __i));
+ // The order of the following three maps matters, since the user could legally
+ // overwrite either of the inputs if __out equals __in1 or __in2. The
+ // "release" map modifier decreases the reference counter by one, and "from"
+ // only moves the data from the device, when the reference count is
+ // decremented to zero.
+ __pstl::__omp_map_release(__in1, __n);
+ __pstl::__omp_map_release(__in2, __n);
+ __pstl::__omp_map_from(__out1, __n);
+ return __out1 + __n;
+}
+
+template <>
+struct __transform_binary<__openmp_backend_tag, execution::parallel_unsequenced_policy> {
+ template <class _Policy,
+ class _ForwardIterator1,
+ class _ForwardIterator2,
+ class _ForwardOutIterator,
+ class _BinaryOperation>
+ _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator>
+ operator()(_Policy&& __policy,
+ _ForwardIterator1 __first1,
+ _ForwardIterator1 __last1,
+ _ForwardIterator2 __first2,
+ _ForwardOutIterator __outit,
+ _BinaryOperation __op) const noexcept {
+ using _ValueType1 = typename iterator_traits<_ForwardIterator1>::value_type;
+ using _ValueType2 = typename iterator_traits<_ForwardIterator2>::value_type;
+ if constexpr (__libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
+ __libcpp_is_contiguous_iterator<_ForwardIterator2>::value &&
+ __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value && is_trivially_copyable_v<_ValueType1> &&
+ is_trivially_copyable_v<_ValueType2>) {
+ return std::__rewrap_iter(
+ __outit,
+ __pstl::__omp_transform(
+ std::__unwrap_iter(__first1),
+ __last1 - __first1,
+ std::__unwrap_iter(__first2),
+ std::__unwrap_iter(__outit),
+ std::move(__op)));
+ } else {
+ using _Backends = __backends_after<__current_configuration, __openmp_backend_tag>;
+ using _Fallback = __dispatch<__pstl::__transform_binary, _Backends, __remove_cvref_t<_Policy>>;
+ return _Fallback{}(
+ std::forward<_Policy>(__policy),
+ std::move(__first1),
+ std::move(__last1),
+ std::move(__first2),
+ std::move(__outit),
+ std::move(__op));
+ }
+ }
+};
+
+//
+// transform_reduce
+//
+#define _LIBCPP_PSTL_OMP_SIMD_1_REDUCTION(omp_op, std_op) \
+ template <class _Iterator, \
+ class _DifferenceType, \
+ typename _Tp, \
+ typename _BinaryOperationType, \
+ typename _UnaryOperation> \
+ _LIBCPP_HIDE_FROM_ABI _Tp __omp_transform_reduce( \
+ _Iterator __first, \
+ _DifferenceType __n, \
+ _Tp __init, \
+ std_op<_BinaryOperationType> __reduce, \
+ _UnaryOperation __transform) noexcept { \
+ __pstl::__omp_map_to(__first, __n); \
+_PSTL_PRAGMA(omp target teams distribute parallel for reduction(omp_op:__init)) \
+ for (_DifferenceType __i = 0; __i < __n; ++__i) \
+ __init = __reduce(__init, __transform(*(__first + __i))); \
+ __pstl::__omp_map_release(__first, __n); \
+ return __init; \
+ }
+
+#define _LIBCPP_PSTL_OMP_SIMD_2_REDUCTION(omp_op, std_op) \
+ template <class _Iterator1, \
+ class _Iterator2, \
+ class _DifferenceType, \
+ typename _Tp, \
+ typename _BinaryOperationType, \
+ typename _UnaryOperation > \
+ _LIBCPP_HIDE_FROM_ABI _Tp __omp_transform_reduce( \
+ _Iterator1 __first1, \
+ _Iterator2 __first2, \
+ _DifferenceType __n, \
+ _Tp __init, \
+ std_op<_BinaryOperationType> __reduce, \
+ _UnaryOperation __transform) noexcept { \
+ __pstl::__omp_map_to(__first1, __n); \
+ __pstl::__omp_map_to(__first2, __n); \
+_PSTL_PRAGMA(omp target teams distribute parallel for reduction(omp_op:__init)) \
+ for (_DifferenceType __i = 0; __i < __n; ++__i) \
+ __init = __reduce(__init, __transform(*(__first1 + __i), *(__first2 + __i))); \
+ __pstl::__omp_map_release(__first1, __n); \
+ __pstl::__omp_map_release(__first2, __n); \
+ return __init; \
+ }
+
+#define _LIBCPP_PSTL_OMP_SIMD_REDUCTION(omp_op, std_op) \
+ _LIBCPP_PSTL_OMP_SIMD_1_REDUCTION(omp_op, std_op) \
+ _LIBCPP_PSTL_OMP_SIMD_2_REDUCTION(omp_op, std_op)
+
+_LIBCPP_PSTL_OMP_SIMD_REDUCTION(+, std::plus)
+_LIBCPP_PSTL_OMP_SIMD_REDUCTION(-, std::minus)
+_LIBCPP_PSTL_OMP_SIMD_REDUCTION(*, std::multiplies)
+_LIBCPP_PSTL_OMP_SIMD_REDUCTION(&&, std::logical_and)
+_LIBCPP_PSTL_OMP_SIMD_REDUCTION(||, std::logical_or)
+_LIBCPP_PSTL_OMP_SIMD_REDUCTION(&, std::bit_and)
+_LIBCPP_PSTL_OMP_SIMD_REDUCTION(|, std::bit_or)
+_LIBCPP_PSTL_OMP_SIMD_REDUCTION(^, std::bit_xor)
+
+// Determine whether a reduction is supported by the OpenMP backend
+template <class _T1, class _T2, class _T3>
+struct __is_supported_reduction : std::false_type {};
+
+#define _LIBCPP_PSTL_IS_SUPPORTED_REDUCTION(func) \
+ template <class _Tp> \
+ struct __is_supported_reduction<func<_Tp>, _Tp, _Tp> : true_type {}; \
+ template <class _Tp, class _Up> \
+ struct __is_supported_reduction<func<>, _Tp, _Up> : true_type {};
+
+// __is_trivial_plus_operation already exists
+_LIBCPP_PSTL_IS_SUPPORTED_REDUCTION(std::plus)
+_LIBCPP_PSTL_IS_SUPPORTED_REDUCTION(std::minus)
+_LIBCPP_PSTL_IS_SUPPORTED_REDUCTION(std::multiplies)
+_LIBCPP_PSTL_IS_SUPPORTED_REDUCTION(std::logical_and)
+_LIBCPP_PSTL_IS_SUPPORTED_REDUCTION(std::logical_or)
+_LIBCPP_PSTL_IS_SUPPORTED_REDUCTION(std::bit_and)
+_LIBCPP_PSTL_IS_SUPPORTED_REDUCTION(std::bit_or)
+_LIBCPP_PSTL_IS_SUPPORTED_REDUCTION(std::bit_xor)
+
+template <>
+struct __transform_reduce<__openmp_backend_tag, execution::parallel_unsequenced_policy> {
+ template <class _Policy, class _ForwardIterator, class _Tp, class _Reduction, class _Transform>
+ _LIBCPP_HIDE_FROM_ABI optional<_Tp>
+ operator()(_Policy&& __policy,
+ _ForwardIterator __first,
+ _ForwardIterator __last,
+ _Tp __init,
+ _Reduction __reduce,
+ _Transform __transform) const noexcept {
+ using _ValueType = typename iterator_traits<_ForwardIterator>::value_type;
+ if constexpr (__libcpp_is_contiguous_iterator<_ForwardIterator>::value && is_arithmetic_v<_Tp> &&
+ __is_supported_reduction<_Reduction, _Tp, _Tp>::value && is_trivially_copyable_v<_ValueType>) {
+ return __pstl::__omp_transform_reduce(
+ std::__unwrap_iter(__first), __last - __first, __init, std::move(__reduce), std::move(__transform));
+ } else {
+ using _Backends = __backends_after<__current_configuration, __openmp_backend_tag>;
+ using _Fallback = __dispatch<__pstl::__transform_reduce, _Backends, __remove_cvref_t<_Policy>>;
+ return _Fallback{}(
+ std::forward<_Policy>(__policy),
+ std::move(__first),
+ std::move(__last),
+ std::move(__init),
+ std::move(__reduce),
+ std::move(__transform));
+ }
+ }
+};
+
+//
+// transform_reduce_binary
+//
+template <>
+struct __transform_reduce_binary<__openmp_backend_tag, execution::parallel_unsequenced_policy> {
+ template <class _Policy,
+ class _ForwardIterator1,
+ class _ForwardIterator2,
+ class _Tp,
+ class _Reduction,
+ class _Transform>
+ _LIBCPP_HIDE_FROM_ABI optional<_Tp> operator()(
+ _Policy&& __policy,
+ _ForwardIterator1 __first1,
+ _ForwardIterator1 __last1,
+ _ForwardIterator2 __first2,
+ _Tp __init,
+ _Reduction __reduce,
+ _Transform __transform) const noexcept {
+ using _ValueType1 = typename iterator_traits<_ForwardIterator1>::value_type;
+ using _ValueType2 = typename iterator_traits<_ForwardIterator2>::value_type;
+ if constexpr (__libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
+ __libcpp_is_contiguous_iterator<_ForwardIterator2>::value && is_arithmetic_v<_Tp> &&
+ __is_supported_reduction<_Reduction, _Tp, _Tp>::value && is_trivially_copyable_v<_ValueType1> &&
+ is_trivially_copyable_v<_ValueType2>) {
+ return __pstl::__omp_transform_reduce(
+ std::__unwrap_iter(__first1),
+ std::__unwrap_iter(__first2),
+ __last1 - __first1,
+ std::move(__init),
+ std::move(__reduce),
+ std::move(__transform));
+ } else {
+ using _Backends = __backends_after<__current_configuration, __openmp_backend_tag>;
+ using _Fallback = __dispatch<__pstl::__transform_reduce_binary, _Backends, __remove_cvref_t<_Policy>>;
+ return _Fallback{}(
+ std::forward<_Policy>(__policy),
+ std::move(__first1),
+ std::move(__last1),
+ std::move(__first2),
+ std::move(__init),
+ std::move(__reduce),
+ std::move(__transform));
+ }
+ }
+};
+
+} // namespace __pstl
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // _LIBCPP___PSTL_BACKENDS_OPENMP_H
diff --git a/libcxx/include/__pstl/dispatch.h b/libcxx/include/__pstl/dispatch.h
index 5e903f7524fe9b..c984c22456120c 100644
--- a/libcxx/include/__pstl/dispatch.h
+++ b/libcxx/include/__pstl/dispatch.h
@@ -58,6 +58,21 @@ struct __find_first_implemented<_Algorithm, __backend_configuration<_B1, _Bn...>
template <template <class, class> class _Algorithm, class _BackendConfiguration, class _ExecutionPolicy>
using __dispatch = typename __find_first_implemented<_Algorithm, _BackendConfiguration, _ExecutionPolicy>::type;
+template <class _BackendConfiguration, class _Backend>
+struct __backends_after_impl;
+
+template <class _Backend, class... _RemainingBackends>
+struct __backends_after_impl<__backend_configuration<_Backend, _RemainingBackends...>, _Backend> {
+ using type = __backend_configuration<_RemainingBackends...>;
+};
+
+template <class _B1, class... _Bn, class _Backend>
+struct __backends_after_impl<__backend_configuration<_B1, _Bn...>, _Backend>
+ : __backends_after_impl<__backend_configuration<_Bn...>, _Backend> {};
+
+template <class _BackendConfiguration, class _Backend>
+using __backends_after = typename __backends_after_impl<_BackendConfiguration, _Backend>::type;
+
} // namespace __pstl
_LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap
index 9ffccf66ff0948..5276a14f20cb70 100644
--- a/libcxx/include/module.modulemap
+++ b/libcxx/include/module.modulemap
@@ -1600,6 +1600,10 @@ module std_private_pstl_backends_libdispatch [system] {
header "__pstl/backends/libdispatch.h"
export *
}
+module std_private_pstl_backends_openmp [system] {
+ header "__pstl/backends/openmp.h"
+ export *
+}
module std_private_pstl_backends_serial [system] {
header "__pstl/backends/serial.h"
export *
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/fill_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/fill_offload.pass.cpp
new file mode 100644
index 00000000000000..cdde2c3d0a9b9d
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/fill_offload.pass.cpp
@@ -0,0 +1,52 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This test will fail if the number of devices detected by OpenMP is larger
+// than zero but std::for_each(std::execution::par_unseq,...) is not executed on
+// the device.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// REQUIRES: libcpp-pstl-backend-openmp
+
+#include <algorithm>
+#include <cassert>
+#include <execution>
+#include <vector>
+#include <omp.h>
+
+int main(int, char**) {
+ // We only run the test if a device is detected by OpenMP
+ if (omp_get_num_devices() < 1)
+ return 0;
+
+ // Initializing test array
+ const int test_size = 10000;
+ std::vector<int> v(test_size, 2);
+
+ // By making an extra map, we can control when the data is mapped to and from
+ // the device, because the map inside std::fill will then only increment and
+ // decrement reference counters and not move data.
+ int* data = v.data();
+#pragma omp target enter data map(to : data[0 : v.size()])
+ std::fill(std::execution::par_unseq, v.begin(), v.end(), -2);
+
+ // At this point v should only contain the value 2
+ for (int vi : v)
+ assert(vi == 2 &&
+ "std::fill transferred data from device to the host but should only have decreased the reference counter.");
+
+// After moving the result back to the host it should now be -2
+#pragma omp target update from(data[0 : v.size()])
+ for (int vi : v)
+ assert(vi == -2 && "std::fill did not update the result on the device.");
+
+#pragma omp target exit data map(delete : data[0 : v.size()])
+
+ return 0;
+}
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if.pass.cpp
new file mode 100644
index 00000000000000..7508d82156e54a
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if.pass.cpp
@@ -0,0 +1,67 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This test verifies that std::find_if(std::execution::par_unseq,...) always
+// finds the first entry in a vector matching the condition. If it was confused
+// with std::any_of, it could return the indexes in a non-increasing order.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// REQUIRES: libcpp-pstl-backend-openmp
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <execution>
+#include <vector>
+
+template <class _Tp>
+void check_find_if(_Tp& data) {
+ const int len = data.end() - data.begin();
+ // Decrementing the values in the test indices
+ int idx[11] = {0, len / 10, len / 9, len / 8, len / 7, len / 6, len / 5, len / 4, len / 3, len / 2, len - 1};
+ for (auto i : idx) {
+ data[i] -= 1;
+ };
+
+ // Asserting that the minimas are found in the correct order
+ for (auto i : idx) {
+ auto found_min = std::find_if(
+ std::execution::par_unseq, data.begin(), data.end(), [&](decltype(data[0])& n) -> bool { return n < 2; });
+ assert(found_min == (data.begin() + i));
+ // Incrementing the minimum, so the next one can be found
+ (*found_min) += 1;
+ }
+}
+
+int main(int, char**) {
+ const int test_size = 10000;
+ // Testing with vector of doubles
+ {
+ std::vector<double> v(test_size, 2.0);
+ check_find_if(v);
+ }
+ // Testing with vector of integers
+ {
+ std::vector<int> v(test_size, 2);
+ check_find_if(v);
+ }
+ // Testing with array of doubles
+ {
+ std::array<double, test_size> a;
+ a.fill(2.0);
+ check_find_if(a);
+ }
+ // Testing with array of integers
+ {
+ std::array<int, test_size> a;
+ a.fill(2);
+ check_find_if(a);
+ }
+ return 0;
+}
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_funptr.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_funptr.pass.cpp
new file mode 100644
index 00000000000000..b84dd68ad3f1b3
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_funptr.pass.cpp
@@ -0,0 +1,36 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This test verifies that we can provide function pointers as input to
+// std::find_if. The OpenMP declare target directive with the `indirect` clause
+// makes an implicit mapping of the host and device function pointers.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// REQUIRES: libcpp-pstl-backend-openmp
+
+#include <algorithm>
+#include <cassert>
+#include <execution>
+#include <vector>
+#include <omp.h>
+
+bool is_odd(int& i) { return (i % 2) == 1; }
+#pragma omp declare target indirect to(is_odd)
+
+int main(int, char**) {
+ const int test_size = 10000;
+ std::vector<int> v(test_size, 2.0);
+ v[123] = 3;
+
+ // Providing for_each a function pointer
+ auto idx = std::find_if(std::execution::par_unseq, v.begin(), v.end(), is_odd);
+
+ assert(idx - v.begin() == 123 && "std::find_if(std::execution::par_unseq,...) does not accept function pointers");
+ return 0;
+}
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_offload.pass.cpp
new file mode 100644
index 00000000000000..7afed4740a0f84
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_offload.pass.cpp
@@ -0,0 +1,39 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This test will fail if the number of devices detected by OpenMP is larger
+// than zero but syd::find_if(std::execution::par_unseq,...) is not executed on
+// the device.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// REQUIRES: libcpp-pstl-backend-openmp
+
+#include <algorithm>
+#include <cassert>
+#include <execution>
+#include <vector>
+#include <omp.h>
+
+int main(int, char**) {
+ // We only run the test if a device is detected by OpenMP
+ if (omp_get_num_devices() < 1)
+ return 0;
+
+ // Initializing test array
+ const int test_size = 10000;
+ std::vector<double> v(test_size, 1);
+
+ auto idx = std::find_if(std::execution::par_unseq, v.begin(), v.end(), [](double&) -> bool {
+ // Returns true if executed on the host
+ return omp_is_initial_device();
+ });
+ assert(idx == v.end() &&
+ "omp_is_initial_device() returned true in the target region. std::find_if was not offloaded.");
+ return 0;
+}
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_funptr.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_funptr.pass.cpp
new file mode 100644
index 00000000000000..4b9c23860800c8
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_funptr.pass.cpp
@@ -0,0 +1,36 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This test verifies that we can provide function pointers as input to
+// std::for_each. The OpenMP declare target directive with the `indirect` clause
+// makes an implicit mapping of the host and device function pointers.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// REQUIRES: libcpp-pstl-backend-openmp
+
+#include <algorithm>
+#include <cassert>
+#include <execution>
+#include <vector>
+#include <omp.h>
+
+void cube(double& d) { d *= d * d; }
+#pragma omp declare target indirect to(cube)
+
+int main(int, char**) {
+ const int test_size = 10000;
+ std::vector<double> v(test_size, 2.0);
+
+ // Providing for_each a function pointer
+ std::for_each(std::execution::par_unseq, v.begin(), v.end(), cube);
+
+ for (int vi : v)
+ assert(vi == 8 && "std::for_each(std::execution::par_unseq,...) does not accept function pointers");
+ return 0;
+}
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_lambda.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_lambda.pass.cpp
new file mode 100644
index 00000000000000..e0b0f18df27268
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_lambda.pass.cpp
@@ -0,0 +1,49 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This test verifies that we can provide a lambda as input to std::for_each in
+// different ways.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// REQUIRES: libcpp-pstl-backend-openmp
+
+#include <algorithm>
+#include <cassert>
+#include <execution>
+#include <vector>
+#include <omp.h>
+
+template <class Function, class Tp>
+void test_lambda(Function fun, Tp initial_value, Tp final_value) {
+ const int test_size = 10000;
+ std::vector<double> v(test_size, initial_value);
+
+ // Providing for_each a function pointer
+ std::for_each(std::execution::par_unseq, v.begin(), v.end(), fun);
+
+ for (int vi : v)
+ assert(vi == final_value && "std::for_each(std::execution::par_unseq,...) does not accept lambdas");
+}
+
+int main(int, char**) {
+ // Capturing by reference
+ auto cube_ref = [&](double& a) { a *= a * a; };
+ test_lambda(cube_ref, 2.0, 8.0);
+
+ // Capturing by value
+ auto cube_val = [=](double& a) { a *= a * a; };
+ test_lambda(cube_val, 2.0, 8.0);
+
+ // Capturing by reference when using additional input
+ double c1 = 1.0;
+ auto cube_ref_2 = [&](double& a) { a = a * a * a + c1; };
+#pragma omp target data map(to : c1)
+ test_lambda(cube_ref_2, 2.0, 9.0);
+ return 0;
+}
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_offload.pass.cpp
new file mode 100644
index 00000000000000..4021e7b18e5f43
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_offload.pass.cpp
@@ -0,0 +1,39 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This test will fail if the number of devices detected by OpenMP is larger
+// than zero but for_each(std::execution::par_unseq,...) is not executed on the
+// device.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// REQUIRES: libcpp-pstl-backend-openmp
+
+#include <algorithm>
+#include <cassert>
+#include <execution>
+#include <vector>
+#include <omp.h>
+
+int main(int, char**) {
+ // We only run the test if a device is detected by OpenMP
+ if (omp_get_num_devices() < 1)
+ return 0;
+
+ // Initializing test array
+ const int test_size = 10000;
+ std::vector<int> v(test_size);
+ std::for_each(std::execution::par_unseq, v.begin(), v.end(), [](int& n) {
+ // Returns true if executed on the host
+ n = omp_is_initial_device();
+ });
+
+ for (int vi : v)
+ assert(vi == 0 && "omp_is_initial_device() returned true in the target region. std::for_each was not offloaded.");
+ return 0;
+}
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_overwrite_input.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_overwrite_input.pass.cpp
new file mode 100644
index 00000000000000..74f177b75e69cb
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_overwrite_input.pass.cpp
@@ -0,0 +1,63 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This test verifies that you can overwrite the input in
+// std::for_each(std::execution::par_unseq,...). If the result was not copied
+// back from the device to the host, this test would fail.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// REQUIRES: libcpp-pstl-backend-openmp
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <execution>
+#include <vector>
+
+template <class _Tp, class _Predicate, class _Up>
+void overwrite(_Tp& data, _Predicate pred, const _Up& value) {
+ // This function assumes that pred will never be the identity transformation
+
+ // Updating the array with a lambda
+ std::for_each(std::execution::par_unseq, data.begin(), data.end(), pred);
+
+ // Asserting that no elements have the intial value
+ for (int di : data)
+ assert(
+ di != value &&
+ "The GPU implementation of std::for_each does not allow users to mutate the input as the C++ standard does.");
+}
+
+int main(int, char**) {
+ const double value = 2.0;
+ const int test_size = 10000;
+ // Testing with vector of doubles
+ {
+ std::vector<double> v(test_size, value);
+ overwrite(v, [&](double& n) { n *= n; }, value);
+ }
+ // Testing with vector of integers
+ {
+ std::vector<int> v(test_size, (int)value);
+ overwrite(v, [&](int& n) { n *= n; }, (int)value);
+ }
+ // Testing with array of doubles
+ {
+ std::array<double, test_size> a;
+ a.fill(value);
+ overwrite(a, [&](double& n) { n *= n; }, value);
+ }
+ // Testing with array of integers
+ {
+ std::array<int, test_size> a;
+ a.fill((int)value);
+ overwrite(a, [&](int& n) { n *= n; }, (int)value);
+ }
+ return 0;
+}
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/gpu_environment_variables.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/gpu_environment_variables.pass.cpp
new file mode 100644
index 00000000000000..801693a3d1eb84
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/gpu_environment_variables.pass.cpp
@@ -0,0 +1,49 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This test verifies that the libc++ test configuration forwards the AMD and
+// NVIDIA environment variables specifying the visible devices. Intially when
+// developing the OpenMP offloading tests, this was not the case, and this test
+// will reveal if the configuration is wrong another time.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// REQUIRES: libcpp-pstl-backend-openmp
+
+#include <string>
+#include <cassert>
+#include <omp.h>
+#include <iostream>
+
+std::string get_env_var(std::string const& env_var_name, int& flag) {
+ char* val;
+ val = getenv(env_var_name.c_str());
+ std::string retval = "";
+ flag = (val != NULL);
+ return (val != NULL) ? val : "";
+}
+
+int main(int, char**) {
+ // Stores whether the environment variable was found
+ int status = 0;
+
+ // Checking for AMD's enviroment variable for specifying visible devices
+ std::string rocr_visible_devices = get_env_var("ROCR_VISIBLE_DEVICES", status);
+ if (status)
+ assert(
+ (rocr_visible_devices.empty() || (omp_get_num_devices() > 0)) &&
+ "ROCR_VISIBLE_DEVICES was set but no devices were detected by OpenMP. The libc++ test suite is misconfigured.");
+
+ // Checking for NVIDIA's enviroment variable for specifying visible devices
+ std::string cuda_visible_devices = get_env_var("CUDA_VISIBLE_DEVICES", status);
+ if (status)
+ assert(
+ (cuda_visible_devices.empty() || (omp_get_num_devices() > 0)) &&
+ "CUDA_VISIBLE_DEVICES was set but no devices were detected by OpenMP. The libc++ test suite is misconfigured.");
+ return 0;
+}
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_version_40.verify.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_version_40.verify.cpp
new file mode 100644
index 00000000000000..85b2656788cf35
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_version_40.verify.cpp
@@ -0,0 +1,21 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// OpenMP target offloading has only been supported since version 4.5. This test
+// verifies that a diagnostic error is prompted if the OpenMP version is below
+// the minimum required version.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// ADDITIONAL_COMPILE_FLAGS: -fopenmp -fopenmp-version=40
+
+// REQUIRES: libcpp-pstl-backend-openmp
+
+#include <algorithm>
+
+// expected-error at __algorithm/pstl_backends/openmp/backend.h:26 {{"OpenMP target offloading has been supported since OpenMP version 4.5 (201511). Please use a more recent version of OpenMP."}}
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_version_45.verify.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_version_45.verify.cpp
new file mode 100644
index 00000000000000..4765ca0d540b34
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_version_45.verify.cpp
@@ -0,0 +1,21 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// OpenMP target offloading has only been supported since version 4.5. This test
+// verifies that one can include algorithm without any diagnostics when using
+// the minimum required version of OpenMP.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// ADDITIONAL_COMPILE_FLAGS: -fopenmp -fopenmp-version=45
+
+// REQUIRES: libcpp-pstl-backend-openmp
+
+#include <algorithm>
+
+// expected-no-diagnostics
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_version_51.verify.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_version_51.verify.cpp
new file mode 100644
index 00000000000000..b7836cb9425486
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_version_51.verify.cpp
@@ -0,0 +1,21 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// OpenMP target offloading has only been supported since version 4.5. This test
+// verifies that one can include algorithm without any diagnostics when using a
+// version that is newer than the minimum requirement.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// ADDITIONAL_COMPILE_FLAGS: -fopenmp -fopenmp-version=51
+
+// REQUIRES: libcpp-pstl-backend-openmp
+
+#include <algorithm>
+
+// expected-no-diagnostics
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_offload.pass.cpp
new file mode 100644
index 00000000000000..f660bc23bbbf75
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_offload.pass.cpp
@@ -0,0 +1,55 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This test will fail if the number of devices detected by OpenMP is larger
+// than zero but std::transform(std::execution::par_unseq,...) is not executed
+// on the device.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// REQUIRES: libcpp-pstl-backend-openmp
+
+#include <algorithm>
+#include <cassert>
+#include <execution>
+#include <vector>
+#include <omp.h>
+
+int main(int, char**) {
+ // We only run the test if a device is detected by OpenMP
+ if (omp_get_num_devices() < 1)
+ return 0;
+
+ // Initializing test arrays
+ const int test_size = 10000;
+ std::vector<int> host(test_size);
+ std::vector<int> device(test_size);
+ // Should execute on host
+ std::transform(std::execution::unseq, host.begin(), host.end(), host.begin(), [](int& h) {
+ // Returns true if executed on the host
+ h = omp_is_initial_device();
+ return h;
+ });
+
+ // Asserting the std::transform(std::execution::unseq,...) executed on the host
+ for (int hi : host)
+ assert(hi && "omp_is_initial_device() returned false. std::transform was offloaded but shouldn't be.");
+
+ // Should execute on device
+ std::transform(
+ std::execution::par_unseq, device.begin(), device.end(), host.begin(), device.begin(), [](int& d, int& h) {
+ // Should return fals
+ d = omp_is_initial_device();
+ return h == d;
+ });
+
+ // Asserting the std::transform(std::execution::par_unseq,...) executed on the device
+ for (int di : device)
+ assert(!di && "omp_is_initial_device() returned true in the target region. std::transform was not offloaded.");
+ return 0;
+}
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_reduce_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_reduce_offload.pass.cpp
new file mode 100644
index 00000000000000..bf059fadf70633
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_reduce_offload.pass.cpp
@@ -0,0 +1,41 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This test will fail if the number of devices detected by OpenMP is larger
+// than zero but std::transform_reduce(std::execution::par_unseq,...) is not
+// executed on the device.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// REQUIRES: libcpp-pstl-backend-openmp
+
+#include <algorithm>
+#include <cassert>
+#include <execution>
+#include <functional>
+#include <vector>
+#include <omp.h>
+
+int main(int, char**) {
+ // We only run the test if a device is detected by OpenMP
+ if (omp_get_num_devices() < 1)
+ return 0;
+
+ // Initializing test array
+ const int test_size = 10000;
+ std::vector<int> v(test_size, 1);
+ std::vector<int> w(test_size, 1);
+
+ int result = std::transform_reduce(
+ std::execution::par_unseq, v.begin(), v.end(), w.begin(), (int)0, std::plus{}, [](int& n, int& m) {
+ return n + m + omp_is_initial_device(); // Gives 2 if executed on device, 3 if executed on host
+ });
+ assert(result == 2 * test_size &&
+ "omp_is_initial_device() returned true in the target region. std::transform_reduce was not offloaded.");
+ return 0;
+}
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_reduce_supported_binary_operations.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_reduce_supported_binary_operations.pass.cpp
new file mode 100644
index 00000000000000..fb324d6db69db1
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_reduce_supported_binary_operations.pass.cpp
@@ -0,0 +1,199 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This test verifies that std::transform_reduce(std::execution::par_unseq,...)
+// can be offloaded for a number of supported binary operations. The following
+// binary operations should be supported for the reducer:
+// - std::plus
+// - std::minus
+// - std::multiplies
+// - std::logical_and
+// - std::logical_or
+// - std::bit_and
+// - std::bit_or
+// - std::bit_xor
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// REQUIRES: libcpp-pstl-backend-openmp
+
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <execution>
+#include <functional>
+#include <vector>
+#include <omp.h>
+#include <iostream>
+
+int main(int, char**) {
+ // We only run the test if a device is detected by OpenMP
+ if (omp_get_num_devices() < 1)
+ return 0;
+
+ // Initializing test array
+ const int test_size = 10000;
+
+ //===--------------------------------------------------------------------===//
+ // Arithmetic binary operators
+ //===--------------------------------------------------------------------===//
+
+ // Addition with doubles
+ {
+ std::vector<double> v(test_size, 1.0);
+ std::vector<double> w(test_size, 2.0);
+ double result = std::transform_reduce(
+ std::execution::par_unseq, v.begin(), v.end(), w.begin(), 5.0, std::plus{}, [](double& a, double& b) {
+ return 0.5 * (b - a) * ((double)!omp_is_initial_device());
+ });
+ assert((std::abs(result - 0.5 * ((double)test_size) - 5.0) < 1e-8) &&
+ "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for the binary "
+ "operation std::plus.");
+ }
+
+ // Subtraction of floats
+ {
+ std::vector<float> v(test_size, 1.0f);
+ std::vector<float> w(test_size, 1.5f);
+ float result = std::transform_reduce(
+ std::execution::par_unseq,
+ v.begin(),
+ v.end(),
+ w.begin(),
+ 1.25 * ((float)test_size),
+ std::minus{},
+ [](float& a, float& b) { return 0.5 * (a + b) * ((float)!omp_is_initial_device()); });
+ assert((std::abs(result) < 1e-8f) &&
+ "std::transform_reduce(std::execution::par_unseq,...) does not have the "
+ "intended effect for the binary operation std::minus.");
+ }
+
+ // Multiplication of doubles
+ {
+ std::vector<double> v(test_size, 1.0);
+ std::vector<double> w(test_size, 0.0001);
+ double result = std::transform_reduce(
+ std::execution::par_unseq, v.begin(), v.end(), w.begin(), -1.0, std::multiplies{}, [](double& a, double& b) {
+ return (a + b) * ((double)!omp_is_initial_device());
+ });
+ assert((std::abs(result + pow(1.0001, test_size)) < 1e-8) &&
+ "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for the binary "
+ "operation std::multiplies.");
+ }
+
+ //===--------------------------------------------------------------------===//
+ // Logical binary operators
+ //===--------------------------------------------------------------------===//
+
+ // Logical and
+ {
+ std::vector<int> v(test_size, 1);
+ // The result should be true with an initial value of 1
+ int result =
+ std::transform_reduce(std::execution::par_unseq, v.begin(), v.end(), 1, std::logical_and{}, [](int& a) {
+ return a && !omp_is_initial_device();
+ });
+ assert(result &&
+ "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for the binary "
+ "operation std::logical_and.");
+
+ // And false by an initial value of 0
+ result = std::transform_reduce(std::execution::par_unseq, v.begin(), v.end(), 0, std::logical_and{}, [](int& a) {
+ return a && !omp_is_initial_device();
+ });
+ assert(!result &&
+ "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for the binary "
+ "operation std::logical_and.");
+ }
+
+ // Logical or
+ {
+ std::vector<int> v(test_size, 0);
+ // The result should be true with an initial value of 1
+ int result = std::transform_reduce(std::execution::par_unseq, v.begin(), v.end(), 1, std::logical_or{}, [](int& a) {
+ return a && !omp_is_initial_device();
+ });
+ assert(result &&
+ "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for the binary "
+ "operation std::logical_or.");
+
+ // And false by an initial value of 0
+ result = std::transform_reduce(std::execution::par_unseq, v.begin(), v.end(), 0, std::logical_or{}, [](int& a) {
+ return a && !omp_is_initial_device();
+ });
+ assert(!result && "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for the "
+ "binary operation std::logical_or.");
+ }
+
+ //===--------------------------------------------------------------------===//
+ // Bitwise binary operators
+ //===--------------------------------------------------------------------===//
+
+ // Bitwise and
+ {
+ std::vector<unsigned int> v(test_size, 3);
+ std::vector<unsigned int> w(test_size, 2);
+ // For odd numbers the result should be true
+ int result =
+ std::transform_reduce(std::execution::par_unseq, v.begin(), v.end(), 0x1, std::bit_and{}, [](unsigned int& a) {
+ return a + omp_is_initial_device();
+ });
+ assert(result && "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for the "
+ "binary operation std::bit_and.");
+
+ // For even numbers the result should be false
+ result =
+ std::transform_reduce(std::execution::par_unseq, w.begin(), w.end(), 0x1, std::bit_and{}, [](unsigned int& a) {
+ return a + omp_is_initial_device();
+ });
+ assert(!result && "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for the "
+ "binary operation std::bit_and.");
+ }
+
+ // Bitwise or
+ {
+ std::vector<unsigned int> v(test_size, 0);
+ int result = std::transform_reduce(
+ std::execution::par_unseq, v.begin(), v.end(), 0, std::bit_or{}, [](unsigned int& a) -> unsigned int {
+ return a || omp_is_initial_device();
+ });
+ assert(!result && "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for the "
+ "binary operation std::bit_or.");
+
+ // After adding a one, the result should be true
+ v[v.size() / 2] = 1;
+ result = std::transform_reduce(
+ std::execution::par_unseq, v.begin(), v.end(), 0, std::bit_or{}, [](unsigned int& a) -> unsigned int {
+ return a && !omp_is_initial_device();
+ });
+ assert(result && "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for the "
+ "binary operation std::bit_or.");
+ }
+
+ // Bitwise xor
+ {
+ std::vector<unsigned int> v(test_size, 0xef);
+ int result =
+ std::transform_reduce(std::execution::par_unseq, v.begin(), v.end(), 0, std::bit_xor{}, [](unsigned int& a) {
+ return a << omp_is_initial_device();
+ });
+ assert(result == 0 && "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for "
+ "the binary operation std::bit_or.");
+
+ // After adding a one, the result should be true
+ v[v.size() / 2] = 0xea;
+ result =
+ std::transform_reduce(std::execution::par_unseq, v.begin(), v.end(), 0, std::bit_xor{}, [](unsigned int& a) {
+ return a << omp_is_initial_device();
+ });
+ assert(result == 5 && "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for "
+ "the binary operation std::bit_or.");
+ }
+
+ return 0;
+}
diff --git a/libcxx/utils/ci/run-buildbot b/libcxx/utils/ci/run-buildbot
index e40c2b635ef908..842ff584ddc15d 100755
--- a/libcxx/utils/ci/run-buildbot
+++ b/libcxx/utils/ci/run-buildbot
@@ -130,6 +130,26 @@ function generate-cmake() {
"${@}"
}
+function generate-cmake-openmp() {
+ echo "--- Generating CMake"
+ ${CMAKE} \
+ -S "${MONOREPO_ROOT}/llvm" \
+ -B "${BUILD_DIR}" \
+ -GNinja -DCMAKE_MAKE_PROGRAM="${NINJA}" \
+ -DCMAKE_BUILD_TYPE=RelWithDebInfo \
+ -DCMAKE_INSTALL_PREFIX="${INSTALL_DIR}" \
+ -DLIBCXX_ENABLE_WERROR=YES \
+ -DLIBCXXABI_ENABLE_WERROR=YES \
+ -DLIBUNWIND_ENABLE_WERROR=YES \
+ -DLIBCXX_ENABLE_CLANG_TIDY=${ENABLE_CLANG_TIDY} \
+ -DLLVM_LIT_ARGS="-sv --xunit-xml-output test-results.xml --timeout=1500 --time-tests" \
+ -DLLVM_ENABLE_PROJECTS="clang;openmp" \
+ -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi;libunwind;offload" \
+ -DLIBCXX_PSTL_BACKEND="openmp" \
+ -DLIBCXX_CXX_ABI=libcxxabi \
+ "${@}"
+}
+
function generate-cmake-libcxx-win() {
generate-cmake-base \
-DLLVM_ENABLE_RUNTIMES="libcxx" \
@@ -514,6 +534,18 @@ generic-optimized-speed)
generate-cmake -C "${MONOREPO_ROOT}/libcxx/cmake/caches/Generic-optimized-speed.cmake"
check-runtimes
;;
+generic-pstl-openmp)
+ clean
+ # TODO: Pass different host and device triples to the test configuration.
+ # For now, the OpenMP buildbot offloads to the host. To set the correct host target triple,
+ # we pass it via the LIBCXX test parameters.
+ PARAMS="target_triple=$(${CXX} --print-target-triple)"
+ generate-cmake-openmp -C "${MONOREPO_ROOT}/libcxx/cmake/caches/Generic-pstl-openmp.cmake" \
+ ${MONOREPO_ROOT}/llvm -DLIBCXX_TEST_PARAMS="${PARAMS}"
+ echo "+++ Installing OpenMP and Clang"
+ ${NINJA} -vC "${BUILD_DIR}" install
+ check-runtimes
+;;
apple-system)
clean
diff --git a/libcxx/utils/libcxx/test/features.py b/libcxx/utils/libcxx/test/features.py
index 7a9631a56e4bbd..f516da9c552bc4 100644
--- a/libcxx/utils/libcxx/test/features.py
+++ b/libcxx/utils/libcxx/test/features.py
@@ -359,6 +359,7 @@ def _mingwSupportsModules(cfg):
"_LIBCPP_HAS_NO_TIME_ZONE_DATABASE": "no-tzdb",
"_LIBCPP_HAS_NO_UNICODE": "libcpp-has-no-unicode",
"_LIBCPP_PSTL_BACKEND_LIBDISPATCH": "libcpp-pstl-backend-libdispatch",
+ "_LIBCPP_PSTL_BACKEND_OPENMP": "openmp_pstl_backend",
}
for macro, feature in macros.items():
DEFAULT_FEATURES.append(
@@ -368,6 +369,27 @@ def _mingwSupportsModules(cfg):
)
)
+DEFAULT_FEATURES.append(
+ Feature(
+ name="libcpp-pstl-backend-openmp",
+ when=lambda cfg: "_LIBCPP_PSTL_BACKEND_OPENMP" in compilerMacros(cfg),
+ actions=[
+ AddFlagIfSupported("-fopenmp"),
+ # The linker needs to find the correct version of libomptarget
+ AddLinkFlag("-Wl,-rpath,%{lib-dir}"),
+ # The preprocessor needs to find the omp.h header. If OpenMP was
+ # installed as a project, the header lives in the following
+ # directory
+ AddFlag("-I %{lib-dir}/../../projects/openmp/runtime/src/"),
+ # And if it was installed as a runtime it lives in the following:
+ AddFlag("-I %{lib-dir}/../../runtimes/runtimes-bins/openmp/runtime/src"),
+ # TODO: Add conditional to test if a GPU target has been detected.
+ # For now, we only offload to the host in this test configuration.
+ # If a GPU were present, we should instead pass --offload-arch=native.
+ AddFlagIfSupported("-fopenmp-targets=%{triple}"),
+ ],
+ )
+)
# Mapping from canonical locale names (used in the tests) to possible locale
# names on various systems. Each locale is considered supported if any of the
diff --git a/libcxx/utils/run.py b/libcxx/utils/run.py
index 6b4d615444bcfa..427e96b91fa370 100755
--- a/libcxx/utils/run.py
+++ b/libcxx/utils/run.py
@@ -64,6 +64,21 @@ def main():
if "TEMP" in os.environ:
env["TEMP"] = os.environ.get("TEMP")
+ # Forwarding the environment variable CUDA_VISIBLE_DEVICES which configures
+ # the visible NVIDIA GPUs.
+ if "CUDA_VISIBLE_DEVICES" in os.environ:
+ env["CUDA_VISIBLE_DEVICES"] = os.environ["CUDA_VISIBLE_DEVICES"]
+
+ # Forwarding the environment variable ROCR_VISIBLE_DEVICES which configures
+ # the visible AMD GPUs.
+ if "ROCR_VISIBLE_DEVICES" in os.environ:
+ env["ROCR_VISIBLE_DEVICES"] = os.environ["ROCR_VISIBLE_DEVICES"]
+
+ # Pass the OpenMP debug flag. Can be used to print information about the
+ # GPU execution of the tests.
+ if "LIBOMPTARGET_DEBUG" in os.environ:
+ env["LIBOMPTARGET_DEBUG"] = os.environ["LIBOMPTARGET_DEBUG"]
+
# Run the command line with the given environment in the execution directory.
return subprocess.call(commandLine, cwd=args.execdir, env=env, shell=False)
diff --git a/libcxxabi/CMakeLists.txt b/libcxxabi/CMakeLists.txt
index 52ba52f3439fb1..01fa382ab93f07 100644
--- a/libcxxabi/CMakeLists.txt
+++ b/libcxxabi/CMakeLists.txt
@@ -414,6 +414,14 @@ if (UNIX AND ${CMAKE_SYSTEM_NAME} MATCHES "AIX")
add_definitions("-D_XOPEN_SOURCE=700")
endif()
+# If the OpenMP PSTL backend has been enabled for libcxx, OpenMP must be
+# enabled during compilation
+if (DEFINED LIBCXX_PSTL_BACKEND)
+ if (LIBCXX_PSTL_BACKEND STREQUAL "openmp")
+ add_compile_options(-fopenmp)
+ endif()
+endif()
+
#===============================================================================
# Setup Source Code
#===============================================================================
>From d83e0de675802ab1bfdde0e9ad712106015621ba Mon Sep 17 00:00:00 2001
From: Vedant Tewari <vedanttewari541 at gmail.com>
Date: Mon, 12 Aug 2024 01:23:46 -0500
Subject: [PATCH 2/2] Algorithms for ranges shift left and shift right
---
.../include/__algorithm/ranges_shift_left.h | 74 ++++++++++++++++++
.../include/__algorithm/ranges_shift_right.h | 75 +++++++++++++++++++
2 files changed, 149 insertions(+)
create mode 100644 libcxx/include/__algorithm/ranges_shift_left.h
create mode 100644 libcxx/include/__algorithm/ranges_shift_right.h
diff --git a/libcxx/include/__algorithm/ranges_shift_left.h b/libcxx/include/__algorithm/ranges_shift_left.h
new file mode 100644
index 00000000000000..5f2d47c7805d55
--- /dev/null
+++ b/libcxx/include/__algorithm/ranges_shift_left.h
@@ -0,0 +1,74 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_RANGES_SHIFT_LEFT_H
+#define _LIBCPP___ALGORITHM_RANGES_SHIFT_LEFT_H
+
+#include <__config>
+#include <__iterator/concepts.h>
+#include <__iterator/iterator_traits.h>
+#include <__ranges/access.h>
+#include <__ranges/concepts.h>
+#include <__ranges/subrange.h>
+#include <__utility/move.h>
+#include <__utility/forward.h>
+#include <__iterator/advance.h>
+#include <__iterator/distance.h>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+# pragma GCC system_header
+#endif
+
+_LIBCPP_PUSH_MACROS
+#include <__undef_macros>
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+namespace ranges {
+
+namespace __shift_left {
+
+struct __fn {
+ template <std::permutable _Iter, std::sentinel_for<_Iter> _Sent>
+ _LIBCPP_HIDE_FROM_ABI constexpr subrange<_Iter>
+ operator()(_Iter __first, _Sent __last, iter_difference_t<_Iter> __n) const {
+ if (__n <= 0) {
+ return {__first, __first};
+ }
+
+ auto __dist = std::ranges::distance(__first, __last);
+ if (__n >= __dist) {
+ return {__first, __first};
+ }
+
+ auto __mid = std::ranges::next(__first, __n);
+ auto __new_last = std::move(__mid, __last, __first);
+ return {__first, __new_last};
+ }
+
+ template <std::ranges::forward_range _Range>
+ requires std::permutable<iterator_t<_Range>>
+ _LIBCPP_HIDE_FROM_ABI constexpr borrowed_subrange_t<_Range>
+ operator()(_Range&& __range, range_difference_t<_Range> __n) const {
+ return (*this)(std::ranges::begin(__range), std::ranges::end(__range), __n);
+ }
+};
+
+} // namespace __shift_left
+
+inline namespace __cpo {
+ inline constexpr auto shift_left = __shift_left::__fn{};
+} // namespace __cpo
+
+} // namespace ranges
+
+_LIBCPP_END_NAMESPACE_STD
+
+_LIBCPP_POP_MACROS
+
+#endif // _LIBCPP___ALGORITHM_RANGES_SHIFT_LEFT_H
diff --git a/libcxx/include/__algorithm/ranges_shift_right.h b/libcxx/include/__algorithm/ranges_shift_right.h
new file mode 100644
index 00000000000000..de4405446ba86f
--- /dev/null
+++ b/libcxx/include/__algorithm/ranges_shift_right.h
@@ -0,0 +1,75 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_RANGES_SHIFT_RIGHT_H
+#define _LIBCPP___ALGORITHM_RANGES_SHIFT_RIGHT_H
+
+#include <__config>
+#include <__iterator/concepts.h>
+#include <__iterator/iterator_traits.h>
+#include <__ranges/access.h>
+#include <__ranges/concepts.h>
+#include <__ranges/subrange.h>
+#include <__utility/move.h>
+#include <__utility/forward.h>
+#include <__iterator/advance.h>
+#include <__iterator/distance.h>
+#include <algorithm>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+# pragma GCC system_header
+#endif
+
+_LIBCPP_PUSH_MACROS
+#include <__undef_macros>
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+namespace ranges {
+
+namespace __shift_right {
+
+struct __fn {
+ template <std::permutable _Iter, std::sentinel_for<_Iter> _Sent>
+ _LIBCPP_HIDE_FROM_ABI constexpr subrange<_Iter>
+ operator()(_Iter __first, _Sent __last, iter_difference_t<_Iter> __n) const {
+ if (__n <= 0) {
+ return {__last, __last};
+ }
+
+ auto __dist = std::ranges::distance(__first, __last);
+ if (__n >= __dist) {
+ return {__last, __last};
+ }
+
+ auto __new_first = std::ranges::next(__first, __dist - __n);
+ std::move_backward(__first, __new_first, __last);
+ return {__new_first, __last};
+ }
+
+ template <std::ranges::forward_range _Range>
+ requires std::permutable<iterator_t<_Range>>
+ _LIBCPP_HIDE_FROM_ABI constexpr borrowed_subrange_t<_Range>
+ operator()(_Range&& __range, range_difference_t<_Range> __n) const {
+ return (*this)(std::ranges::begin(__range), std::ranges::end(__range), __n);
+ }
+};
+
+} // namespace __shift_right
+
+inline namespace __cpo {
+ inline constexpr auto shift_right = __shift_right::__fn{};
+} // namespace __cpo
+
+} // namespace ranges
+
+_LIBCPP_END_NAMESPACE_STD
+
+_LIBCPP_POP_MACROS
+
+#endif // _LIBCPP___ALGORITHM_RANGES_SHIFT_RIGHT_H
More information about the libcxx-commits
mailing list