[libcxx] [libcxxabi] [llvm] Adding Separate OpenMP Offloading Backend to `libcxx/include/__algorithm/pstl_backends` (PR #66968)

Wed Mar 27 12:09:56 PDT 2024

https://github.com/ldionne updated https://github.com/llvm/llvm-project/pull/66968

>From f219812cc96c8d1a094b2d90589641afa25ab45e Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Wed, 20 Sep 2023 17:06:10 -0700
Subject: [PATCH 01/65] Adding OpenMP Offloading Backend for C++ Parallel
 Algorithms

---
 libcxx/CMakeLists.txt                         | 14 +++
 libcxx/include/CMakeLists.txt                 |  5 +
 libcxx/include/__algorithm/pstl_backend.h     |  8 ++
 .../__algorithm/pstl_backends/gpu_backend.h   | 21 +++++
 .../pstl_backends/gpu_backends/backend.h      | 33 +++++++
 .../pstl_backends/gpu_backends/fill.h         | 59 ++++++++++++
 .../pstl_backends/gpu_backends/for_each.h     | 59 ++++++++++++
 .../pstl_backends/gpu_backends/omp_offload.h  | 91 +++++++++++++++++++
 libcxx/include/__config_site.in               |  1 +
 9 files changed, 291 insertions(+)
 create mode 100644 libcxx/include/__algorithm/pstl_backends/gpu_backend.h
 create mode 100644 libcxx/include/__algorithm/pstl_backends/gpu_backends/backend.h
 create mode 100644 libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
 create mode 100644 libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
 create mode 100644 libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h

diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt
index de7fa8e3be31a8..340666da76cfc0 100644
--- a/libcxx/CMakeLists.txt
+++ b/libcxx/CMakeLists.txt
@@ -294,6 +294,8 @@ option(LIBCXX_HAS_WIN32_THREAD_API "Ignore auto-detection and force use of win32
 option(LIBCXX_HAS_EXTERNAL_THREAD_API
   "Build libc++ with an externalized threading API.
    This option may only be set to ON when LIBCXX_ENABLE_THREADS=ON." OFF)
+option(LIBCXX_ENABLE_GPU_OFFLOAD 
+  "Build libc++ with support for GPU offload" OFF)
 
 if (LIBCXX_ENABLE_THREADS)
   set(LIBCXX_PSTL_CPU_BACKEND "std_thread" CACHE STRING "Which PSTL CPU backend to use")
@@ -301,6 +303,14 @@ else()
   set(LIBCXX_PSTL_CPU_BACKEND "serial" CACHE STRING "Which PSTL CPU backend to use")
 endif()
 
+if (NOT DEFINED LIBCXX_PSTL_GPU_BACKEND)
+  if (${LIBCXX_ENABLE_GPU_OFFLOAD})
+    set(LIBCXX_PSTL_GPU_BACKEND "omp_offload" CACHE STRING "Which PSTL GPU backend to use")
+  else()
+    set(LIBCXX_PSTL_GPU_BACKEND "none" CACHE STRING "Which PSTL GPU backend to use")
+  endif()
+endif()
+
 # Misc options ----------------------------------------------------------------
 # FIXME: Turn -pedantic back ON. It is currently off because it warns
 # about #include_next which is used everywhere.
@@ -782,6 +792,10 @@ else()
                        Valid backends are: serial, std_thread and libdispatch")
 endif()
 
+if (LIBCXX_PSTL_GPU_BACKEND STREQUAL "omp_offload")
+  config_define(1 _LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
+endif()
+
 if (LIBCXX_ABI_DEFINES)
   set(abi_defines)
   foreach (abi_define ${LIBCXX_ABI_DEFINES})
diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index 5e8c1700ee4cda..7b40905144e336 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -85,6 +85,11 @@ set(files
   __algorithm/pstl_backends/cpu_backends/thread.h
   __algorithm/pstl_backends/cpu_backends/transform.h
   __algorithm/pstl_backends/cpu_backends/transform_reduce.h
+  __algorithm/pstl_backends/gpu_backend.h
+  __algorithm/pstl_backends/gpu_backends/backend.h
+  __algorithm/pstl_backends/gpu_backends/fill.h
+  __algorithm/pstl_backends/gpu_backends/for_each.h
+  __algorithm/pstl_backends/gpu_backends/omp_offload.h
   __algorithm/pstl_copy.h
   __algorithm/pstl_count.h
   __algorithm/pstl_fill.h
diff --git a/libcxx/include/__algorithm/pstl_backend.h b/libcxx/include/__algorithm/pstl_backend.h
index dcb1c99e7962e6..aeba1cdf8dd9f0 100644
--- a/libcxx/include/__algorithm/pstl_backend.h
+++ b/libcxx/include/__algorithm/pstl_backend.h
@@ -10,6 +10,7 @@
 #define _LIBCPP___ALGORITHM_PSTL_BACKEND_H
 
 #include <__algorithm/pstl_backends/cpu_backend.h>
+#include <__algorithm/pstl_backends/gpu_backend.h>
 #include <__config>
 #include <execution>
 
@@ -210,10 +211,17 @@ struct __select_backend<std::execution::parallel_policy> {
   using type = __cpu_backend_tag;
 };
 
+#   if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
+template <>
+struct __select_backend<std::execution::parallel_unsequenced_policy> {
+  using type = __gpu_backend_tag;
+};
+#   else
 template <>
 struct __select_backend<std::execution::parallel_unsequenced_policy> {
   using type = __cpu_backend_tag;
 };
+#   endif
 
 #  else
 
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
new file mode 100644
index 00000000000000..46a85f77b5deb9
--- /dev/null
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
@@ -0,0 +1,21 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
+
+#include <__config>
+
+#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+
+#if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
+#include <__algorithm/pstl_backends/gpu_backends/fill.h>
+#include <__algorithm/pstl_backends/gpu_backends/for_each.h>
+#endif
+
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/backend.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/backend.h
new file mode 100644
index 00000000000000..a8b400afbb94d9
--- /dev/null
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/backend.h
@@ -0,0 +1,33 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_BACKEND_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_BACKEND_H
+
+#include <__config>
+#include <cstddef>
+
+#if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
+#  include <__algorithm/pstl_backends/gpu_backends/omp_offload.h>
+#endif
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+#if _LIBCPP_STD_VER >= 17
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+struct __gpu_backend_tag {};
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // _LIBCPP_STD_VER >= 17
+
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_BACKEND_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
new file mode 100644
index 00000000000000..5603e18a5d2d3f
--- /dev/null
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
@@ -0,0 +1,59 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FILL_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FILL_H
+
+#include <__algorithm/fill.h>
+#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
+#include <__config>
+#include <__iterator/concepts.h>
+#include <__type_traits/is_execution_policy.h>
+#include <__utility/terminate_on_exception.h>
+#include <stdio.h>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+template <class _ExecutionPolicy, class _ForwardIterator, class _Tp>
+_LIBCPP_HIDE_FROM_ABI void
+__pstl_fill(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) {
+  // It is only safe to execute for_each on the GPU, it the execution policy is 
+  // parallel unsequenced, as it is the only execution policy prohibiting throwing
+  // exceptions and allowing SIMD instructions
+  if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
+                       __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+    std::__par_backend::__parallel_for_simd_val_1(__first, __last - __first, __value);
+  }
+  // Else if the excution policy is parallel, we execute for_each on the CPU instead
+  else  if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
+                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+    std::__terminate_on_exception([&] {
+      __par_backend::__parallel_for(
+          __first, __last, [&__value](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
+            std::__pstl_fill<__remove_parallel_policy_t<_ExecutionPolicy>>(
+                __cpu_backend_tag{}, __brick_first, __brick_last, __value);
+          });
+    });
+  // Else we execute for_each in serial
+  } else {
+    std::fill(__first, __last, __value);
+  }
+}
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FILL_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
new file mode 100644
index 00000000000000..20486d83863f42
--- /dev/null
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
@@ -0,0 +1,59 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FOR_EACH_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FOR_EACH_H
+
+#include <__algorithm/for_each.h>
+#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
+#include <__config>
+#include <__iterator/concepts.h>
+#include <__type_traits/is_execution_policy.h>
+#include <__utility/terminate_on_exception.h>
+#include <stdio.h>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+template <class _ExecutionPolicy, class _ForwardIterator, class _Functor>
+_LIBCPP_HIDE_FROM_ABI void
+__pstl_for_each(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Functor __func) {
+  // It is only safe to execute for_each on the GPU, it the execution policy is 
+  // parallel unsequenced, as it is the only execution policy prohibiting throwing
+  // exceptions and allowing SIMD instructions
+  if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
+                       __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+    std::__par_backend::__parallel_for_simd_1(__first, __last - __first, __func);
+  }
+  // Else if the excution policy is parallel, we execute for_each on the CPU instead
+  else  if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
+                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+    std::__terminate_on_exception([&] {
+      std::__par_backend::__parallel_for(
+          __first, __last, [__func](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
+            std::__pstl_for_each<__remove_parallel_policy_t<_ExecutionPolicy>>(
+                __cpu_backend_tag{}, __brick_first, __brick_last, __func);
+          });
+    });
+  // Else we execute for_each in serial
+  } else {
+    std::for_each(__first, __last, __func);
+  }
+}
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FOR_EACH_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
new file mode 100644
index 00000000000000..840118dbec5057
--- /dev/null
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
@@ -0,0 +1,91 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_OMP_OFFLOAD_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_OMP_OFFLOAD_H
+
+#include <__assert>
+#include <__config>
+#include <__utility/move.h>
+#include <cstddef>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+_LIBCPP_PUSH_MACROS
+#include <__undef_macros>
+
+#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+namespace __par_backend {
+inline namespace __omp_gpu_backend {
+
+// In OpenMP, we need to extract the pointer for the underlying data for data
+// structures like std::vector and std::array to be able to map the data to the
+// device.
+
+template <typename T>
+_LIBCPP_HIDE_FROM_ABI inline T __omp_extract_base_ptr(T p) {
+  return p;
+}
+
+template <typename T>
+_LIBCPP_HIDE_FROM_ABI inline T __omp_extract_base_ptr(std::__wrap_iter<T> w) {
+  std::pointer_traits<std::__wrap_iter<T>> PT;
+  return PT.to_address(w);
+}
+
+// Applying function or lambda in a loop
+
+template <class _Iterator, class _DifferenceType, class _Function>
+_LIBCPP_HIDE_FROM_ABI _Iterator __omp_parallel_for_simd_1(_Iterator __first, _DifferenceType __n, _Function __f) noexcept {
+  #pragma omp target teams distribute parallel for simd map(tofrom:__first[0:__n])
+  for (_DifferenceType __i = 0; __i < __n; ++__i)
+    __f(__first[__i]);
+
+  return __first + __n;
+}
+
+// Extracting the underlying pointer
+
+template <class _Iterator, class _DifferenceType, class _Function>
+_LIBCPP_HIDE_FROM_ABI _Iterator __parallel_for_simd_1(_Iterator __first, _DifferenceType __n, _Function __f) noexcept {
+  __omp_parallel_for_simd_1(__omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __f);
+  return __first + __n;
+}
+
+// Assigning a value in a loop
+
+template <class _Index, class _DifferenceType, class _Tp>
+_LIBCPP_HIDE_FROM_ABI _Index __omp_parallel_for_simd_val_1(_Index __first, _DifferenceType __n, const _Tp& __value) noexcept {
+  #pragma omp target teams distribute parallel for simd map(tofrom:__first[0:__n]) map(to:__value)
+  for (_DifferenceType __i = 0; __i < __n; ++__i)
+    __first[__i] = __value;
+
+  return __first + __n;
+}
+
+template <class _Index, class _DifferenceType, class _Tp>
+_LIBCPP_HIDE_FROM_ABI _Index __parallel_for_simd_val_1(_Index __first, _DifferenceType __n, const _Tp& __value) noexcept {
+  __omp_parallel_for_simd_val_1(__omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __value);
+  return __first + __n;
+}
+
+} // namespace __omp_gpu_backend
+} // namespace __par_backend
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+_LIBCPP_POP_MACROS
+
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_OMP_OFFLOAD_H
diff --git a/libcxx/include/__config_site.in b/libcxx/include/__config_site.in
index c85cbcd02c441b..e0edddce3afc3f 100644
--- a/libcxx/include/__config_site.in
+++ b/libcxx/include/__config_site.in
@@ -34,6 +34,7 @@
 #cmakedefine _LIBCPP_PSTL_CPU_BACKEND_SERIAL
 #cmakedefine _LIBCPP_PSTL_CPU_BACKEND_THREAD
 #cmakedefine _LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH
+#cmakedefine _LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD
 
 // Hardening.
 #cmakedefine01 _LIBCPP_ENABLE_HARDENED_MODE_DEFAULT

>From cefbecc65e665afa81e6b1a25d93cf9ab624f830 Mon Sep 17 00:00:00 2001
From: antonrydahl <rydahl2610 at gmail.com>
Date: Wed, 20 Sep 2023 17:48:25 -0700
Subject: [PATCH 02/65] Clang formatting OpenMP backend for parallel algorithms

---
 libcxx/include/__algorithm/pstl_backend.h           |  6 +++---
 .../include/__algorithm/pstl_backends/gpu_backend.h |  4 ++--
 .../__algorithm/pstl_backends/gpu_backends/fill.h   | 12 ++++++------
 .../pstl_backends/gpu_backends/for_each.h           | 12 ++++++------
 .../pstl_backends/gpu_backends/omp_offload.h        | 13 ++++++++-----
 5 files changed, 25 insertions(+), 22 deletions(-)

diff --git a/libcxx/include/__algorithm/pstl_backend.h b/libcxx/include/__algorithm/pstl_backend.h
index aeba1cdf8dd9f0..06e0790a283d55 100644
--- a/libcxx/include/__algorithm/pstl_backend.h
+++ b/libcxx/include/__algorithm/pstl_backend.h
@@ -211,17 +211,17 @@ struct __select_backend<std::execution::parallel_policy> {
   using type = __cpu_backend_tag;
 };
 
-#   if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
+#    if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
 template <>
 struct __select_backend<std::execution::parallel_unsequenced_policy> {
   using type = __gpu_backend_tag;
 };
-#   else
+#    else
 template <>
 struct __select_backend<std::execution::parallel_unsequenced_policy> {
   using type = __cpu_backend_tag;
 };
-#   endif
+#    endif
 
 #  else
 
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
index 46a85f77b5deb9..7237036156a1bf 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
@@ -14,8 +14,8 @@
 #include <__algorithm/pstl_backends/gpu_backends/backend.h>
 
 #if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
-#include <__algorithm/pstl_backends/gpu_backends/fill.h>
-#include <__algorithm/pstl_backends/gpu_backends/for_each.h>
+#  include <__algorithm/pstl_backends/gpu_backends/fill.h>
+#  include <__algorithm/pstl_backends/gpu_backends/for_each.h>
 #endif
 
 #endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
index 5603e18a5d2d3f..32926da87e2a08 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
@@ -10,8 +10,8 @@
 #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FILL_H
 
 #include <__algorithm/fill.h>
-#include <__algorithm/pstl_backends/gpu_backends/backend.h>
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
+#include <__algorithm/pstl_backends/gpu_backends/backend.h>
 #include <__config>
 #include <__iterator/concepts.h>
 #include <__type_traits/is_execution_policy.h>
@@ -29,16 +29,16 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 template <class _ExecutionPolicy, class _ForwardIterator, class _Tp>
 _LIBCPP_HIDE_FROM_ABI void
 __pstl_fill(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) {
-  // It is only safe to execute for_each on the GPU, it the execution policy is 
+  // It is only safe to execute for_each on the GPU if the execution policy is
   // parallel unsequenced, as it is the only execution policy prohibiting throwing
   // exceptions and allowing SIMD instructions
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
-                       __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
     std::__par_backend::__parallel_for_simd_val_1(__first, __last - __first, __value);
   }
   // Else if the excution policy is parallel, we execute for_each on the CPU instead
-  else  if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
-                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+  else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
+                     __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
     std::__terminate_on_exception([&] {
       __par_backend::__parallel_for(
           __first, __last, [&__value](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
@@ -46,7 +46,7 @@ __pstl_fill(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last
                 __cpu_backend_tag{}, __brick_first, __brick_last, __value);
           });
     });
-  // Else we execute for_each in serial
+    // Else we execute for_each in serial
   } else {
     std::fill(__first, __last, __value);
   }
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
index 20486d83863f42..14de2af8e4a15c 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
@@ -10,8 +10,8 @@
 #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FOR_EACH_H
 
 #include <__algorithm/for_each.h>
-#include <__algorithm/pstl_backends/gpu_backends/backend.h>
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
+#include <__algorithm/pstl_backends/gpu_backends/backend.h>
 #include <__config>
 #include <__iterator/concepts.h>
 #include <__type_traits/is_execution_policy.h>
@@ -29,16 +29,16 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 template <class _ExecutionPolicy, class _ForwardIterator, class _Functor>
 _LIBCPP_HIDE_FROM_ABI void
 __pstl_for_each(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Functor __func) {
-  // It is only safe to execute for_each on the GPU, it the execution policy is 
+  // It is only safe to execute for_each on the GPU if the execution policy is
   // parallel unsequenced, as it is the only execution policy prohibiting throwing
   // exceptions and allowing SIMD instructions
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
-                       __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
     std::__par_backend::__parallel_for_simd_1(__first, __last - __first, __func);
   }
   // Else if the excution policy is parallel, we execute for_each on the CPU instead
-  else  if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
-                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+  else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
+                     __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
     std::__terminate_on_exception([&] {
       std::__par_backend::__parallel_for(
           __first, __last, [__func](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
@@ -46,7 +46,7 @@ __pstl_for_each(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __
                 __cpu_backend_tag{}, __brick_first, __brick_last, __func);
           });
     });
-  // Else we execute for_each in serial
+    // Else we execute for_each in serial
   } else {
     std::for_each(__first, __last, __func);
   }
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
index 840118dbec5057..4baa4e7f65859d 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
@@ -46,8 +46,9 @@ _LIBCPP_HIDE_FROM_ABI inline T __omp_extract_base_ptr(std::__wrap_iter<T> w) {
 // Applying function or lambda in a loop
 
 template <class _Iterator, class _DifferenceType, class _Function>
-_LIBCPP_HIDE_FROM_ABI _Iterator __omp_parallel_for_simd_1(_Iterator __first, _DifferenceType __n, _Function __f) noexcept {
-  #pragma omp target teams distribute parallel for simd map(tofrom:__first[0:__n])
+_LIBCPP_HIDE_FROM_ABI _Iterator
+__omp_parallel_for_simd_1(_Iterator __first, _DifferenceType __n, _Function __f) noexcept {
+#  pragma omp target teams distribute parallel for simd map(tofrom : __first[0 : __n])
   for (_DifferenceType __i = 0; __i < __n; ++__i)
     __f(__first[__i]);
 
@@ -65,8 +66,9 @@ _LIBCPP_HIDE_FROM_ABI _Iterator __parallel_for_simd_1(_Iterator __first, _Differ
 // Assigning a value in a loop
 
 template <class _Index, class _DifferenceType, class _Tp>
-_LIBCPP_HIDE_FROM_ABI _Index __omp_parallel_for_simd_val_1(_Index __first, _DifferenceType __n, const _Tp& __value) noexcept {
-  #pragma omp target teams distribute parallel for simd map(tofrom:__first[0:__n]) map(to:__value)
+_LIBCPP_HIDE_FROM_ABI _Index
+__omp_parallel_for_simd_val_1(_Index __first, _DifferenceType __n, const _Tp& __value) noexcept {
+#  pragma omp target teams distribute parallel for simd map(tofrom : __first[0 : __n]) map(to : __value)
   for (_DifferenceType __i = 0; __i < __n; ++__i)
     __first[__i] = __value;
 
@@ -74,7 +76,8 @@ _LIBCPP_HIDE_FROM_ABI _Index __omp_parallel_for_simd_val_1(_Index __first, _Diff
 }
 
 template <class _Index, class _DifferenceType, class _Tp>
-_LIBCPP_HIDE_FROM_ABI _Index __parallel_for_simd_val_1(_Index __first, _DifferenceType __n, const _Tp& __value) noexcept {
+_LIBCPP_HIDE_FROM_ABI _Index
+__parallel_for_simd_val_1(_Index __first, _DifferenceType __n, const _Tp& __value) noexcept {
   __omp_parallel_for_simd_val_1(__omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __value);
   return __first + __n;
 }

>From 5b6d4e48bb7408e360420327e62f4943a7c8d139 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Thu, 21 Sep 2023 12:50:17 -0700
Subject: [PATCH 03/65] Making PSTL GPU backend depend on CMake options rather
 than command line options

---
 libcxx/CMakeLists.txt                                 | 11 +++++------
 libcxx/include/__algorithm/pstl_backend.h             |  2 +-
 .../include/__algorithm/pstl_backends/gpu_backend.h   |  2 +-
 .../__algorithm/pstl_backends/gpu_backends/backend.h  |  8 ++++++--
 libcxx/include/__config_site.in                       |  1 +
 5 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt
index 340666da76cfc0..3c27c1ea4861a9 100644
--- a/libcxx/CMakeLists.txt
+++ b/libcxx/CMakeLists.txt
@@ -303,12 +303,10 @@ else()
   set(LIBCXX_PSTL_CPU_BACKEND "serial" CACHE STRING "Which PSTL CPU backend to use")
 endif()
 
-if (NOT DEFINED LIBCXX_PSTL_GPU_BACKEND)
-  if (${LIBCXX_ENABLE_GPU_OFFLOAD})
-    set(LIBCXX_PSTL_GPU_BACKEND "omp_offload" CACHE STRING "Which PSTL GPU backend to use")
-  else()
-    set(LIBCXX_PSTL_GPU_BACKEND "none" CACHE STRING "Which PSTL GPU backend to use")
-  endif()
+if (${LIBCXX_ENABLE_GPU_OFFLOAD})
+  set(LIBCXX_PSTL_GPU_BACKEND "omp_offload" CACHE STRING "Which PSTL GPU backend to use")
+else()
+  set(LIBCXX_PSTL_GPU_BACKEND "none" CACHE STRING "Which PSTL GPU backend to use")
 endif()
 
 # Misc options ----------------------------------------------------------------
@@ -792,6 +790,7 @@ else()
                        Valid backends are: serial, std_thread and libdispatch")
 endif()
 
+config_define_if(LIBCXX_ENABLE_GPU_OFFLOAD _LIBCPP_PSTL_GPU_OFFLOAD)
 if (LIBCXX_PSTL_GPU_BACKEND STREQUAL "omp_offload")
   config_define(1 _LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
 endif()
diff --git a/libcxx/include/__algorithm/pstl_backend.h b/libcxx/include/__algorithm/pstl_backend.h
index 06e0790a283d55..e7e7244d5e9666 100644
--- a/libcxx/include/__algorithm/pstl_backend.h
+++ b/libcxx/include/__algorithm/pstl_backend.h
@@ -211,7 +211,7 @@ struct __select_backend<std::execution::parallel_policy> {
   using type = __cpu_backend_tag;
 };
 
-#    if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
+#    if defined(_LIBCPP_PSTL_GPU_OFFLOAD)
 template <>
 struct __select_backend<std::execution::parallel_unsequenced_policy> {
   using type = __gpu_backend_tag;
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
index 7237036156a1bf..d2a814b441224a 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
@@ -13,7 +13,7 @@
 
 #include <__algorithm/pstl_backends/gpu_backends/backend.h>
 
-#if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
+#if defined(_LIBCPP_PSTL_GPU_OFFLOAD)
 #  include <__algorithm/pstl_backends/gpu_backends/fill.h>
 #  include <__algorithm/pstl_backends/gpu_backends/for_each.h>
 #endif
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/backend.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/backend.h
index a8b400afbb94d9..a03ad35d8d2ae3 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/backend.h
@@ -12,8 +12,12 @@
 #include <__config>
 #include <cstddef>
 
-#if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
-#  include <__algorithm/pstl_backends/gpu_backends/omp_offload.h>
+#if defined(_LIBCPP_PSTL_GPU_OFFLOAD)
+#  if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
+#    include <__algorithm/pstl_backends/gpu_backends/omp_offload.h>
+#  else
+#    error Invalid PSTL GPU backend
+#  endif
 #endif
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
diff --git a/libcxx/include/__config_site.in b/libcxx/include/__config_site.in
index e0edddce3afc3f..e7fb4f42307933 100644
--- a/libcxx/include/__config_site.in
+++ b/libcxx/include/__config_site.in
@@ -35,6 +35,7 @@
 #cmakedefine _LIBCPP_PSTL_CPU_BACKEND_THREAD
 #cmakedefine _LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH
 #cmakedefine _LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD
+#cmakedefine _LIBCPP_PSTL_GPU_OFFLOAD
 
 // Hardening.
 #cmakedefine01 _LIBCPP_ENABLE_HARDENED_MODE_DEFAULT

>From 2be1154685ab6a25ed4587ac6ca326dee0bbf04e Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Thu, 21 Sep 2023 17:07:58 -0700
Subject: [PATCH 04/65] Added OpenMP offloaded version of std::transform

---
 libcxx/include/CMakeLists.txt                 |   1 +
 .../__algorithm/pstl_backends/gpu_backend.h   |   1 +
 .../pstl_backends/gpu_backends/omp_offload.h  | 119 +++++++++++++++++-
 .../pstl_backends/gpu_backends/transform.h    | 117 +++++++++++++++++
 4 files changed, 233 insertions(+), 5 deletions(-)
 create mode 100644 libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h

diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index 7b40905144e336..841020cc8a23d9 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -90,6 +90,7 @@ set(files
   __algorithm/pstl_backends/gpu_backends/fill.h
   __algorithm/pstl_backends/gpu_backends/for_each.h
   __algorithm/pstl_backends/gpu_backends/omp_offload.h
+  __algorithm/pstl_backends/gpu_backends/transform.h
   __algorithm/pstl_copy.h
   __algorithm/pstl_count.h
   __algorithm/pstl_fill.h
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
index d2a814b441224a..dac26592dac5c1 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
@@ -16,6 +16,7 @@
 #if defined(_LIBCPP_PSTL_GPU_OFFLOAD)
 #  include <__algorithm/pstl_backends/gpu_backends/fill.h>
 #  include <__algorithm/pstl_backends/gpu_backends/for_each.h>
+#  include <__algorithm/pstl_backends/gpu_backends/transform.h>
 #endif
 
 #endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
index 4baa4e7f65859d..69221cbb851923 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
@@ -28,6 +28,17 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 namespace __par_backend {
 inline namespace __omp_gpu_backend {
 
+// Returns true when __p lies in the half-open pointer range [__a, __b).
+template <typename _Tp1, typename _Tp2, typename _Tp3>
+_LIBCPP_HIDE_FROM_ABI inline bool __omp_in_ptr_range(_Tp1, _Tp2, _Tp3) {
+  return false; // Fallback for arguments that are not three pointers of one type: never reports "in range".
+}
+
+template <typename _Tp>
+_LIBCPP_HIDE_FROM_ABI inline bool __omp_in_ptr_range(_Tp* __a, _Tp* __p, _Tp* __b) {
+  return std::less_equal<_Tp*>{}(__a, __p) && std::less<_Tp*>{}(__p, __b);
+}
+
 // In OpenMP, we need to extract the pointer for the underlying data for data
 // structures like std::vector and std::array to be able to map the data to the
 // device.
@@ -43,12 +54,16 @@ _LIBCPP_HIDE_FROM_ABI inline T __omp_extract_base_ptr(std::__wrap_iter<T> w) {
   return PT.to_address(w);
 }
 
+//===----------------------------------------------------------------------===//
+// Templates for one iterator
+//===----------------------------------------------------------------------===//
+
 // Applying function or lambda in a loop
 
 template <class _Iterator, class _DifferenceType, class _Function>
 _LIBCPP_HIDE_FROM_ABI _Iterator
-__omp_parallel_for_simd_1(_Iterator __first, _DifferenceType __n, _Function __f) noexcept {
-#  pragma omp target teams distribute parallel for simd map(tofrom : __first[0 : __n])
+__omp_parallel_for_simd_1(_Iterator __first, _DifferenceType __n, _Function __f, const int __device = 0) noexcept {
+#  pragma omp target teams distribute parallel for simd map(tofrom : __first[0 : __n]) device(__device)
   for (_DifferenceType __i = 0; __i < __n; ++__i)
     __f(__first[__i]);
 
@@ -66,9 +81,10 @@ _LIBCPP_HIDE_FROM_ABI _Iterator __parallel_for_simd_1(_Iterator __first, _Differ
 // Assigning a value in a loop
 
 template <class _Index, class _DifferenceType, class _Tp>
-_LIBCPP_HIDE_FROM_ABI _Index
-__omp_parallel_for_simd_val_1(_Index __first, _DifferenceType __n, const _Tp& __value) noexcept {
-#  pragma omp target teams distribute parallel for simd map(tofrom : __first[0 : __n]) map(to : __value)
+_LIBCPP_HIDE_FROM_ABI _Index __omp_parallel_for_simd_val_1(
+    _Index __first, _DifferenceType __n, const _Tp& __value, const int __device = 0) noexcept {
+#  pragma omp target teams distribute parallel for simd map(from : __first[0 : __n]) map(always, to : __value)         \
+      device(__device)
   for (_DifferenceType __i = 0; __i < __n; ++__i)
     __first[__i] = __value;
 
@@ -82,6 +98,99 @@ __parallel_for_simd_val_1(_Index __first, _DifferenceType __n, const _Tp& __valu
   return __first + __n;
 }
 
+//===----------------------------------------------------------------------===//
+// Templates for two iterators
+//===----------------------------------------------------------------------===//
+
+template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Function>
+_LIBCPP_HIDE_FROM_ABI _Iterator1 __omp_parallel_for_simd_2(
+    _Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Function __f, const int __device = 0) noexcept {
+  // Equal-length ranges overlap iff either one starts inside the other, so the range test must run both ways.
+  if ((!std::is_same<_Iterator1, _Iterator2>::value) ||
+      (!__omp_gpu_backend::__omp_in_ptr_range(__first1, __first2, __first1 + __n) &&
+       !__omp_gpu_backend::__omp_in_ptr_range(__first2, __first1, __first2 + __n))) {
+#  pragma omp target teams distribute parallel for simd map(to : __first1[0 : __n]) map(from : __first2[0 : __n])      \
+      device(__device)
+    for (_DifferenceType __i = 0; __i < __n; ++__i)
+      __f(__first1[__i], __first2[__i]);
+    return __first1 + __n;
+  }
+#  pragma omp target teams distribute parallel for simd map(tofrom : __first1[0 : __n], __first2[0 : __n])             \
+      device(__device)
+  for (_DifferenceType __i = 0; __i < __n; ++__i)
+    __f(__first1[__i], __first2[__i]);
+  return __first1 + __n;
+}
+
+// Extracting the underlying pointer
+
+template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Function>
+_LIBCPP_HIDE_FROM_ABI _Iterator1
+__parallel_for_simd_2(_Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Function __f) noexcept {
+  __omp_parallel_for_simd_2(
+      __omp_gpu_backend::__omp_extract_base_ptr(__first1),
+      __n,
+      __omp_gpu_backend::__omp_extract_base_ptr(__first2),
+      __f);
+  return __first1 + __n;
+}
+
+//===----------------------------------------------------------------------===//
+// Templates for three iterators
+//===----------------------------------------------------------------------===//
+
+template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Iterator3, class _Function>
+_LIBCPP_HIDE_FROM_ABI _Iterator1 __omp_parallel_for_simd_3(
+    _Iterator1 __first1,
+    _DifferenceType __n,
+    _Iterator2 __first2,
+    _Iterator3 __first3,
+    _Function __f,
+    const int __device = 0) noexcept {
+  // The output range [__first3, __first3 + __n) may overlap either input range. Comparing pointers into
+  // different objects with the built-in operators gives an unspecified result, so the test goes through
+  // __omp_in_ptr_range, which only ever compares pointers of one common type.
+  // Iterators whose type differs from _Iterator3 are assumed not to alias the output.
+  // If the output provably overlaps neither input, it is safe to reduce the amount of data copied to
+  // and from the device. Each pair is tested in both directions: either range may start inside the other.
+  const bool __no_overlap_13 =
+      !std::is_same<_Iterator1, _Iterator3>::value ||
+      (!__omp_gpu_backend::__omp_in_ptr_range(__first1, __first3, __first1 + __n) &&
+       !__omp_gpu_backend::__omp_in_ptr_range(__first3, __first1, __first3 + __n));
+  const bool __no_overlap_23 =
+      !std::is_same<_Iterator2, _Iterator3>::value ||
+      (!__omp_gpu_backend::__omp_in_ptr_range(__first2, __first3, __first2 + __n) &&
+       !__omp_gpu_backend::__omp_in_ptr_range(__first3, __first2, __first3 + __n));
+  if (__no_overlap_13 && __no_overlap_23) {
+#  pragma omp target teams distribute parallel for simd map(to : __first1[0 : __n], __first2[0 : __n])                 \
+      map(from : __first3[0 : __n]) device(__device)
+    for (_DifferenceType __i = 0; __i < __n; ++__i)
+      __f(__first1[__i], __first2[__i], __first3[__i]);
+    return __first1 + __n;
+  }
+  // In the general case, we have to map all data to and from the device
+#  pragma omp target teams distribute parallel for simd map(                                                           \
+          tofrom : __first1[0 : __n], __first2[0 : __n], __first3[0 : __n]) device(__device)
+  for (_DifferenceType __i = 0; __i < __n; ++__i)
+    __f(__first1[__i], __first2[__i], __first3[__i]);
+
+  return __first1 + __n;
+}
+
+// Extracting the underlying pointer
+
+template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Iterator3, class _Function>
+_LIBCPP_HIDE_FROM_ABI _Iterator1 __parallel_for_simd_3(
+    _Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Iterator3 __first3, _Function __f) noexcept {
+  __omp_parallel_for_simd_3(
+      __omp_gpu_backend::__omp_extract_base_ptr(__first1),
+      __n,
+      __omp_gpu_backend::__omp_extract_base_ptr(__first2),
+      __omp_gpu_backend::__omp_extract_base_ptr(__first3),
+      __f);
+  return __first1 + __n;
+}
+
 } // namespace __omp_gpu_backend
 } // namespace __par_backend
 
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
new file mode 100644
index 00000000000000..03eba11a3f5f52
--- /dev/null
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
@@ -0,0 +1,117 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_TRANSFORM_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_TRANSFORM_H
+
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
+#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__algorithm/transform.h>
+#include <__config>
+#include <__iterator/concepts.h>
+#include <__iterator/iterator_traits.h>
+#include <__type_traits/enable_if.h>
+#include <__type_traits/is_execution_policy.h>
+#include <__type_traits/remove_cvref.h>
+#include <__utility/terminate_on_exception.h>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+template <class _ExecutionPolicy, class _ForwardIterator, class _ForwardOutIterator, class _UnaryOperation>
+_LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
+    __gpu_backend_tag,
+    _ForwardIterator __first,
+    _ForwardIterator __last,
+    _ForwardOutIterator __result,
+    _UnaryOperation __op) {
+  if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
+                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
+                __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) {
+    return std::__par_backend::__parallel_for_simd_2(
+        __first,
+        __last - __first,
+        __result,
+        [&](__iter_reference<_ForwardIterator> __in_value, __iter_reference<_ForwardOutIterator> __out_value) {
+          __out_value = __op(__in_value);
+        });
+  } else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
+                       __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
+                       __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) {
+    std::__terminate_on_exception([&] {
+      std::__par_backend::__parallel_for(
+          __first, __last, [__op, __first, __result](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
+            return std::__pstl_transform<__remove_parallel_policy_t<_ExecutionPolicy>>(
+                __cpu_backend_tag{}, __brick_first, __brick_last, __result + (__brick_first - __first), __op);
+          });
+    });
+    return __result + (__last - __first);
+  } else {
+    return std::transform(__first, __last, __result, __op);
+  }
+}
+
+template <class _ExecutionPolicy,
+          class _ForwardIterator1,
+          class _ForwardIterator2,
+          class _ForwardOutIterator,
+          class _BinaryOperation,
+          enable_if_t<is_execution_policy_v<__remove_cvref_t<_ExecutionPolicy>>, int> = 0>
+_LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
+    __gpu_backend_tag,
+    _ForwardIterator1 __first1,
+    _ForwardIterator1 __last1,
+    _ForwardIterator2 __first2,
+    _ForwardOutIterator __result,
+    _BinaryOperation __op) {
+  if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
+                __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
+                __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value &&
+                __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) {
+    return std::__par_backend::__parallel_for_simd_3(
+        __first1,
+        __last1 - __first1,
+        __first2,
+        __result,
+        [&](__iter_reference<_ForwardIterator1> __in1,
+            __iter_reference<_ForwardIterator2> __in2,
+            __iter_reference<_ForwardOutIterator> __out_value) { __out_value = __op(__in1, __in2); });
+  } else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
+                       __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
+                       __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value &&
+                       __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) {
+    std::__terminate_on_exception([&] {
+      std::__par_backend::__parallel_for(
+          __first1,
+          __last1,
+          [__op, __first1, __first2, __result](_ForwardIterator1 __brick_first, _ForwardIterator1 __brick_last) {
+            return std::__pstl_transform<__remove_parallel_policy_t<_ExecutionPolicy>>(
+                __cpu_backend_tag{},
+                __brick_first,
+                __brick_last,
+                __first2 + (__brick_first - __first1),
+                __result + (__brick_first - __first1),
+                __op);
+          });
+    });
+    return __result + (__last1 - __first1);
+  } else {
+    return std::transform(__first1, __last1, __first2, __result, __op);
+  }
+}
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_TRANSFORM_H

>From 2234b4363812290042360968ed26dc669b83bdeb Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Fri, 22 Sep 2023 11:55:53 -0700
Subject: [PATCH 05/65] Changing lambdas to capture by value in std::transform
 for GPUs

---
 .../__algorithm/pstl_backends/gpu_backends/transform.h    | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
index 03eba11a3f5f52..7fcfde44aaaa7a 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
@@ -38,11 +38,13 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
                 __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) {
+    // While the CPU backend captures by reference, [&], that is not valid when
+    // offloading to the GPU. Therefore we must capture by value, [=].
     return std::__par_backend::__parallel_for_simd_2(
         __first,
         __last - __first,
         __result,
-        [&](__iter_reference<_ForwardIterator> __in_value, __iter_reference<_ForwardOutIterator> __out_value) {
+        [=](__iter_reference<_ForwardIterator> __in_value, __iter_reference<_ForwardOutIterator> __out_value) {
           __out_value = __op(__in_value);
         });
   } else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
@@ -78,12 +80,14 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
                 __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value &&
                 __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) {
+    // While the CPU backend captures by reference, [&], that is not valid when
+    // offloading to the GPU. Therefore we must capture by value, [=].
     return std::__par_backend::__parallel_for_simd_3(
         __first1,
         __last1 - __first1,
         __first2,
         __result,
-        [&](__iter_reference<_ForwardIterator1> __in1,
+        [=](__iter_reference<_ForwardIterator1> __in1,
             __iter_reference<_ForwardIterator2> __in2,
             __iter_reference<_ForwardOutIterator> __out_value) { __out_value = __op(__in1, __in2); });
   } else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&

>From ad6932827e2291ae95add79b9529aa4d54b12451 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Mon, 25 Sep 2023 13:13:39 -0700
Subject: [PATCH 06/65] GPU Offloading Implementation of std::transform_reduce

---
 libcxx/include/CMakeLists.txt                 |   1 +
 .../__algorithm/pstl_backends/gpu_backend.h   |   1 +
 .../pstl_backends/gpu_backends/omp_offload.h  | 113 ++++++++++++++
 .../gpu_backends/transform_reduce.h           | 147 ++++++++++++++++++
 4 files changed, 262 insertions(+)
 create mode 100644 libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h

diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index 841020cc8a23d9..278ed49a9f162b 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -91,6 +91,7 @@ set(files
   __algorithm/pstl_backends/gpu_backends/for_each.h
   __algorithm/pstl_backends/gpu_backends/omp_offload.h
   __algorithm/pstl_backends/gpu_backends/transform.h
+  __algorithm/pstl_backends/gpu_backends/transform_reduce.h
   __algorithm/pstl_copy.h
   __algorithm/pstl_count.h
   __algorithm/pstl_fill.h
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
index dac26592dac5c1..ea7f39dea90547 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
@@ -17,6 +17,7 @@
 #  include <__algorithm/pstl_backends/gpu_backends/fill.h>
 #  include <__algorithm/pstl_backends/gpu_backends/for_each.h>
 #  include <__algorithm/pstl_backends/gpu_backends/transform.h>
+#  include <__algorithm/pstl_backends/gpu_backends/transform_reduce.h>
 #endif
 
 #endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
index 69221cbb851923..d1cc6133f8e087 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
@@ -191,6 +191,119 @@ _LIBCPP_HIDE_FROM_ABI _Iterator1 __parallel_for_simd_3(
   return __first1 + __n;
 }
 
+//===----------------------------------------------------------------------===//
+// Templates for reductions
+//===----------------------------------------------------------------------===//
+
+// General case
+
+#  define __PSTL_OMP_SIMD_1_REDUCTION(omp_op, std_op)                                                                            \
+    template <class _Iterator,                                                                                                   \
+              class _DifferenceType,                                                                                             \
+              typename _Tp,                                                                                                      \
+              typename _BinaryOperationType,                                                                                     \
+              typename _UnaryOperation,                                                                                          \
+              __enable_if_t<is_arithmetic_v<_Tp>, int> = 0 >                                                                     \
+    _LIBCPP_HIDE_FROM_ABI _Tp __omp_parallel_for_simd_reduction_1(                                                               \
+        _Iterator __first,                                                                                                       \
+        _DifferenceType __n,                                                                                                     \
+        _Tp __init,                                                                                                              \
+        std_op<_BinaryOperationType> __reduce,                                                                                   \
+        _UnaryOperation __transform,                                                                                             \
+        const int __device = 0) noexcept {                                                                                       \
+_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init) map(to : __first[0 : __n]) device(__device)) \
+      for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                            \
+        __init = __reduce(__init, __transform(__first[__i]));                                                                    \
+      return __init;                                                                                                             \
+    }
+
+#  define __PSTL_OMP_SIMD_2_REDUCTION(omp_op, std_op)                                                                                                \
+    template <class _Iterator1,                                                                                                                      \
+              class _Iterator2,                                                                                                                      \
+              class _DifferenceType,                                                                                                                 \
+              typename _Tp,                                                                                                                          \
+              typename _BinaryOperationType,                                                                                                         \
+              typename _UnaryOperation,                                                                                                              \
+              __enable_if_t<is_arithmetic_v<_Tp>, int> = 0 >                                                                                         \
+    _LIBCPP_HIDE_FROM_ABI _Tp __omp_parallel_for_simd_reduction_2(                                                                                   \
+        _Iterator1 __first1,                                                                                                                         \
+        _Iterator2 __first2,                                                                                                                         \
+        _DifferenceType __n,                                                                                                                         \
+        _Tp __init,                                                                                                                                  \
+        std_op<_BinaryOperationType> __reduce,                                                                                                       \
+        _UnaryOperation __transform,                                                                                                                 \
+        const int __device = 0) noexcept {                                                                                                           \
+_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init) map(to : __first1[0 : __n], __first2[0 : __n]) device(__device)) \
+      for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                                                \
+        __init = __reduce(__init, __transform(__first1[__i], __first2[__i]));                                                                        \
+      return __init;                                                                                                                                 \
+    }
+
+#  define __PSTL_OMP_SIMD_REDUCTION(omp_op, std_op)                                                                    \
+    __PSTL_OMP_SIMD_1_REDUCTION(omp_op, std_op)                                                                        \
+    __PSTL_OMP_SIMD_2_REDUCTION(omp_op, std_op)
+
+// Addition
+__PSTL_OMP_SIMD_REDUCTION(+, std::plus)
+
+// Subtraction
+__PSTL_OMP_SIMD_REDUCTION(-, std::minus)
+
+// Multiplication
+__PSTL_OMP_SIMD_REDUCTION(*, std::multiplies)
+
+// Logical and
+__PSTL_OMP_SIMD_REDUCTION(&&, std::logical_and)
+
+// Logical or
+__PSTL_OMP_SIMD_REDUCTION(||, std::logical_or)
+
+// Bitwise and
+__PSTL_OMP_SIMD_REDUCTION(&, std::bit_and)
+
+// Bitwise or
+__PSTL_OMP_SIMD_REDUCTION(|, std::bit_or)
+
+// Bitwise xor
+__PSTL_OMP_SIMD_REDUCTION(^, std::bit_xor)
+
+// Extracting the underlying pointers and forwarding the requested device
+
+template <class _Iterator, class _DifferenceType, typename _Tp, typename _BinaryOperation, typename _UnaryOperation >
+_LIBCPP_HIDE_FROM_ABI _Tp __parallel_for_simd_reduction_1(
+    _Iterator __first,
+    _DifferenceType __n,
+    _Tp __init,
+    _BinaryOperation __reduce,
+    _UnaryOperation __transform,
+    const int __device = 0) noexcept {
+  return __omp_parallel_for_simd_reduction_1(
+      __omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __init, __reduce, __transform, __device);
+}
+
+template <class _Iterator1,
+          class _Iterator2,
+          class _DifferenceType,
+          typename _Tp,
+          typename _BinaryOperation,
+          typename _UnaryOperation >
+_LIBCPP_HIDE_FROM_ABI _Tp __parallel_for_simd_reduction_2(
+    _Iterator1 __first1,
+    _Iterator2 __first2,
+    _DifferenceType __n,
+    _Tp __init,
+    _BinaryOperation __reduce,
+    _UnaryOperation __transform,
+    const int __device = 0) noexcept {
+  return __omp_parallel_for_simd_reduction_2(
+      __omp_gpu_backend::__omp_extract_base_ptr(__first1),
+      __omp_gpu_backend::__omp_extract_base_ptr(__first2),
+      __n,
+      __init,
+      __reduce,
+      __transform, __device); // forward the caller's device instead of silently using the default
+}
+
 } // namespace __omp_gpu_backend
 } // namespace __par_backend
 
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h
new file mode 100644
index 00000000000000..43e5631aef04af
--- /dev/null
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h
@@ -0,0 +1,147 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_TRANSFORM_REDUCE_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_TRANSFORM_REDUCE_H
+
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
+#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__config>
+#include <__iterator/concepts.h>
+#include <__iterator/iterator_traits.h>
+#include <__numeric/transform_reduce.h>
+#include <__type_traits/is_arithmetic.h>
+#include <__type_traits/is_execution_policy.h>
+#include <__type_traits/operation_traits.h>
+#include <__utility/move.h>
+#include <__utility/terminate_on_exception.h>
+#include <new>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+//===----------------------------------------------------------------------===//
+// Two input iterators
+//===----------------------------------------------------------------------===//
+
+template <class _ExecutionPolicy,
+          class _ForwardIterator1,
+          class _ForwardIterator2,
+          class _Tp,
+          class _BinaryOperation1,
+          class _BinaryOperation2>
+_LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
+    __gpu_backend_tag,
+    _ForwardIterator1 __first1,
+    _ForwardIterator1 __last1,
+    _ForwardIterator2 __first2,
+    _Tp __init,
+    _BinaryOperation1 __reduce,
+    _BinaryOperation2 __transform) {
+  if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
+                __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
+                __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value) {
+    return std::__par_backend::__parallel_for_simd_reduction_2(
+        __first1, // copied, not moved: argument evaluation order is unspecified and __last1 - __first1 reads it
+        std::move(__first2),
+        __last1 - __first1,
+        std::move(__init),
+        std::move(__reduce),
+        [=](__iter_reference<_ForwardIterator1> __in_value_1, __iter_reference<_ForwardIterator2> __in_value_2) {
+          return __transform(__in_value_1, __in_value_2);
+        });
+  } else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
+                       __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
+                       __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value) {
+    return std::__terminate_on_exception([&] {
+      return __par_backend::__parallel_transform_reduce(
+          __first1,
+          std::move(__last1),
+          [__first1, __first2, __transform](_ForwardIterator1 __iter) {
+            return __transform(*__iter, *(__first2 + (__iter - __first1)));
+          },
+          std::move(__init),
+          std::move(__reduce),
+          [__first1, __first2, __reduce, __transform](
+              _ForwardIterator1 __brick_first, _ForwardIterator1 __brick_last, _Tp __brick_init) {
+            return std::__pstl_transform_reduce<__remove_parallel_policy_t<_ExecutionPolicy>>(
+                __cpu_backend_tag{},
+                __brick_first,
+                std::move(__brick_last),
+                __first2 + (__brick_first - __first1),
+                std::move(__brick_init),
+                std::move(__reduce),
+                std::move(__transform));
+          });
+    });
+  } else {
+    return std::transform_reduce(
+        std::move(__first1),
+        std::move(__last1),
+        std::move(__first2),
+        std::move(__init),
+        std::move(__reduce),
+        std::move(__transform));
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// One input iterator
+//===----------------------------------------------------------------------===//
+
+template <class _ExecutionPolicy, class _ForwardIterator, class _Tp, class _BinaryOperation, class _UnaryOperation>
+_LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
+    __gpu_backend_tag,
+    _ForwardIterator __first,
+    _ForwardIterator __last,
+    _Tp __init,
+    _BinaryOperation __reduce,
+    _UnaryOperation __transform) {
+  if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
+                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+    return std::__par_backend::__parallel_for_simd_reduction_1(
+        __first, // copied, not moved: argument evaluation order is unspecified and __last - __first reads it
+        __last - __first,
+        std::move(__init),
+        std::move(__reduce),
+        [=](__iter_reference<_ForwardIterator> __in_value) { return __transform(__in_value); });
+  } else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
+                       __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+    return std::__terminate_on_exception([&] {
+      return __par_backend::__parallel_transform_reduce(
+          std::move(__first),
+          std::move(__last),
+          [__transform](_ForwardIterator __iter) { return __transform(*__iter); },
+          std::move(__init),
+          __reduce,
+          [__transform, __reduce](auto __brick_first, auto __brick_last, _Tp __brick_init) {
+            return std::__pstl_transform_reduce<__remove_parallel_policy_t<_ExecutionPolicy>>(
+                __cpu_backend_tag{},
+                std::move(__brick_first),
+                std::move(__brick_last),
+                std::move(__brick_init),
+                std::move(__reduce),
+                std::move(__transform));
+          });
+    });
+  } else {
+    return std::transform_reduce(
+        std::move(__first), std::move(__last), std::move(__init), std::move(__reduce), std::move(__transform));
+  }
+}
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_TRANSFORM_REDUCE_H

>From d052f7b89be1ab1b0d7cda3f61c26576a9757930 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Tue, 26 Sep 2023 19:55:12 -0700
Subject: [PATCH 07/65] Fixed almost all test cases that failed during ninja
 check-cxx

---
 libcxx/include/CMakeLists.txt                 |   4 +
 .../__algorithm/pstl_backends/gpu_backend.h   |   4 +
 .../pstl_backends/gpu_backends/any_of.h       |  41 +++++++
 .../pstl_backends/gpu_backends/fill.h         |  20 +---
 .../pstl_backends/gpu_backends/find_if.h      |  44 ++++++++
 .../pstl_backends/gpu_backends/for_each.h     |  17 +--
 .../pstl_backends/gpu_backends/merge.h        |  51 +++++++++
 .../pstl_backends/gpu_backends/omp_offload.h  |  94 +++++++++-------
 .../pstl_backends/gpu_backends/stable_sort.h  |  38 +++++++
 .../pstl_backends/gpu_backends/transform.h    |  61 ++--------
 .../gpu_backends/transform_reduce.h           | 105 ++++++------------
 11 files changed, 292 insertions(+), 187 deletions(-)
 create mode 100644 libcxx/include/__algorithm/pstl_backends/gpu_backends/any_of.h
 create mode 100644 libcxx/include/__algorithm/pstl_backends/gpu_backends/find_if.h
 create mode 100644 libcxx/include/__algorithm/pstl_backends/gpu_backends/merge.h
 create mode 100644 libcxx/include/__algorithm/pstl_backends/gpu_backends/stable_sort.h

diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index 278ed49a9f162b..e74985feff09e3 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -86,10 +86,14 @@ set(files
   __algorithm/pstl_backends/cpu_backends/transform.h
   __algorithm/pstl_backends/cpu_backends/transform_reduce.h
   __algorithm/pstl_backends/gpu_backend.h
+  __algorithm/pstl_backends/gpu_backends/any_of.h
   __algorithm/pstl_backends/gpu_backends/backend.h
   __algorithm/pstl_backends/gpu_backends/fill.h
+  __algorithm/pstl_backends/gpu_backends/find_if.h
   __algorithm/pstl_backends/gpu_backends/for_each.h
+  __algorithm/pstl_backends/gpu_backends/merge.h
   __algorithm/pstl_backends/gpu_backends/omp_offload.h
+  __algorithm/pstl_backends/gpu_backends/stable_sort.h
   __algorithm/pstl_backends/gpu_backends/transform.h
   __algorithm/pstl_backends/gpu_backends/transform_reduce.h
   __algorithm/pstl_copy.h
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
index ea7f39dea90547..f41332fbf9f6d4 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
@@ -14,8 +14,12 @@
 #include <__algorithm/pstl_backends/gpu_backends/backend.h>
 
 #if defined(_LIBCPP_PSTL_GPU_OFFLOAD)
+#  include <__algorithm/pstl_backends/gpu_backends/any_of.h>
 #  include <__algorithm/pstl_backends/gpu_backends/fill.h>
+#  include <__algorithm/pstl_backends/gpu_backends/find_if.h>
 #  include <__algorithm/pstl_backends/gpu_backends/for_each.h>
+#  include <__algorithm/pstl_backends/gpu_backends/merge.h>
+#  include <__algorithm/pstl_backends/gpu_backends/stable_sort.h>
 #  include <__algorithm/pstl_backends/gpu_backends/transform.h>
 #  include <__algorithm/pstl_backends/gpu_backends/transform_reduce.h>
 #endif
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/any_of.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/any_of.h
new file mode 100644
index 00000000000000..8d911de55dcd68
--- /dev/null
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/any_of.h
@@ -0,0 +1,41 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_ANY_OF_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_ANY_OF_H
+
+#include <__algorithm/any_of.h>
+#include <__algorithm/find_if.h>
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
+#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__atomic/atomic.h>
+#include <__atomic/memory_order.h>
+#include <__config>
+#include <__functional/operations.h>
+#include <__iterator/concepts.h>
+#include <__type_traits/is_execution_policy.h>
+#include <__utility/pair.h>
+#include <__utility/terminate_on_exception.h>
+#include <cstdint>
+
+#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+template <class _ExecutionPolicy, class _ForwardIterator, class _Predicate>
+_LIBCPP_HIDE_FROM_ABI bool
+__pstl_any_of(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) {
+  // TODO: Implement GPU backend. Until then, any_of is forwarded unchanged to the CPU backend.
+  return std::__pstl_any_of<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __pred);
+}
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_ANY_OF_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
index 32926da87e2a08..8dc6bc6a6179c0 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
@@ -14,6 +14,7 @@
 #include <__algorithm/pstl_backends/gpu_backends/backend.h>
 #include <__config>
 #include <__iterator/concepts.h>
+#include <__iterator/iterator_traits.h>
 #include <__type_traits/is_execution_policy.h>
 #include <__utility/terminate_on_exception.h>
 #include <stdio.h>
@@ -33,23 +34,12 @@ __pstl_fill(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last
   // parallel unsequenced, as it is the only execution policy prohibiting throwing
   // exceptions and allowing SIMD instructions
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
-                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
+                __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
     std::__par_backend::__parallel_for_simd_val_1(__first, __last - __first, __value);
   }
-  // Else if the excution policy is parallel, we execute for_each on the CPU instead
-  else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
-                     __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
-    std::__terminate_on_exception([&] {
-      __par_backend::__parallel_for(
-          __first, __last, [&__value](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
-            std::__pstl_fill<__remove_parallel_policy_t<_ExecutionPolicy>>(
-                __cpu_backend_tag{}, __brick_first, __brick_last, __value);
-          });
-    });
-    // Else we execute for_each in serial
-  } else {
-    std::fill(__first, __last, __value);
-  }
+  // Run fill on the CPU backend. NOTE(review): the offload branch above has no return, so this always runs.
+  return std::__pstl_fill<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __value);
 }
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/find_if.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/find_if.h
new file mode 100644
index 00000000000000..2d34938f92dff3
--- /dev/null
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/find_if.h
@@ -0,0 +1,44 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_FIND_IF_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_FIND_IF_H
+
+#include <__algorithm/find_if.h>
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
+#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__atomic/atomic.h>
+#include <__config>
+#include <__functional/operations.h>
+#include <__iterator/concepts.h>
+#include <__iterator/iterator_traits.h>
+#include <__type_traits/is_execution_policy.h>
+#include <__utility/pair.h>
+#include <__utility/terminate_on_exception.h>
+#include <cstddef>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+template <class _ExecutionPolicy, class _ForwardIterator, class _Predicate>
+_LIBCPP_HIDE_FROM_ABI _ForwardIterator
+__pstl_find_if(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) {
+  // TODO: Implement the GPU backend. Until then, find_if is forwarded unchanged to the CPU backend.
+  return std::__pstl_find_if<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __pred);
+}
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_FIND_IF_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
index 14de2af8e4a15c..23c8da27e64ae3 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
@@ -33,23 +33,12 @@ __pstl_for_each(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __
   // parallel unsequenced, as it is the only execution policy prohibiting throwing
   // exceptions and allowing SIMD instructions
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
-                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
+                __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
     std::__par_backend::__parallel_for_simd_1(__first, __last - __first, __func);
   }
   // Else if the excution policy is parallel, we execute for_each on the CPU instead
-  else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
-                     __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
-    std::__terminate_on_exception([&] {
-      std::__par_backend::__parallel_for(
-          __first, __last, [__func](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
-            std::__pstl_for_each<__remove_parallel_policy_t<_ExecutionPolicy>>(
-                __cpu_backend_tag{}, __brick_first, __brick_last, __func);
-          });
-    });
-    // Else we execute for_each in serial
-  } else {
-    std::for_each(__first, __last, __func);
-  }
+  return std::__pstl_for_each<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __func);
 }
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/merge.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/merge.h
new file mode 100644
index 00000000000000..bc947ebb27ac7f
--- /dev/null
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/merge.h
@@ -0,0 +1,51 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_MERGE_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_MERGE_H
+
+#include <__algorithm/merge.h>
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
+#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__config>
+#include <__iterator/concepts.h>
+#include <__type_traits/is_execution_policy.h>
+#include <__utility/move.h>
+#include <__utility/terminate_on_exception.h>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+template <class _ExecutionPolicy,
+          class _ForwardIterator1,
+          class _ForwardIterator2,
+          class _ForwardOutIterator,
+          class _Comp>
+_LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_merge(
+    __gpu_backend_tag,
+    _ForwardIterator1 __first1,
+    _ForwardIterator1 __last1,
+    _ForwardIterator2 __first2,
+    _ForwardIterator2 __last2,
+    _ForwardOutIterator __result,
+    _Comp __comp) {
+  // TODO: Implement GPU backend. Until then, merge is forwarded unchanged to the CPU backend.
+  return std::__pstl_merge<_ExecutionPolicy>(
+      __cpu_backend_tag{}, __first1, __last1, __first2, __last2, __result, __comp);
+}
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_MERGE_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
index d1cc6133f8e087..36acafd448ec00 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
@@ -11,9 +11,19 @@
 
 #include <__assert>
 #include <__config>
+#include <__functional/operations.h>
+#include <__iterator/wrap_iter.h>
+#include <__memory/addressof.h>
+#include <__memory/pointer_traits.h>
+#include <__type_traits/is_pointer.h>
+#include <__type_traits/is_same.h>
 #include <__utility/move.h>
 #include <cstddef>
 
+// is_same
+
+// __libcpp_is_contiguous_iterator
+
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
 #endif
@@ -30,27 +40,33 @@ inline namespace __omp_gpu_backend {
 
 // Checking if a pointer is in a range
 template <typename T1, typename T2, typename T3>
-_LIBCPP_HIDE_FROM_ABI inline bool __omp_in_ptr_range(T1 a, T2 p, T3 b) {
+_LIBCPP_HIDE_FROM_ABI inline bool __omp_in_ptr_range(T1, T2, T3) {
   return false;
 }
 
-template <typename T>
-_LIBCPP_HIDE_FROM_ABI inline bool __omp_in_ptr_range(T* a, T* p, T* b) {
-  return std::less_equal<T*>{}(a, p) && std::less<T*>{}(p, b);
+template <typename _Tp>
+_LIBCPP_HIDE_FROM_ABI inline bool __omp_in_ptr_range(_Tp* a, _Tp* p, _Tp* b) {
+  return std::less_equal<_Tp*>{}(a, p) && std::less<_Tp*>{}(p, b);
 }
 
 // In OpenMP, we need to extract the pointer for the underlying data for data
 // structures like std::vector and std::array to be able to map the data to the
 // device.
 
-template <typename T>
-_LIBCPP_HIDE_FROM_ABI inline T __omp_extract_base_ptr(T p) {
+template <typename _Tp, std::enable_if<std::is_pointer<_Tp>::value >::type* = 0>
+_LIBCPP_HIDE_FROM_ABI inline _Tp __omp_extract_base_ptr(_Tp p) {
   return p;
 }
 
-template <typename T>
-_LIBCPP_HIDE_FROM_ABI inline T __omp_extract_base_ptr(std::__wrap_iter<T> w) {
-  std::pointer_traits<std::__wrap_iter<T>> PT;
+template <typename _Tp, typename std::enable_if<!std::is_pointer<_Tp>::value>::type* = nullptr>
+_LIBCPP_HIDE_FROM_ABI inline auto __omp_extract_base_ptr(_Tp p) {
+  // Non-pointer iterators only, so raw pointers unambiguously pick the pointer overload above.
+  return std::addressof(*p);
+}
+
+template <typename _Tp>
+_LIBCPP_HIDE_FROM_ABI inline _Tp __omp_extract_base_ptr(std::__wrap_iter<_Tp> w) {
+  std::pointer_traits<std::__wrap_iter<_Tp>> PT;
   return PT.to_address(w);
 }
 
@@ -61,8 +77,8 @@ _LIBCPP_HIDE_FROM_ABI inline T __omp_extract_base_ptr(std::__wrap_iter<T> w) {
 // Applying function or lambda in a loop
 
 template <class _Iterator, class _DifferenceType, class _Function>
-_LIBCPP_HIDE_FROM_ABI _Iterator
-__omp_parallel_for_simd_1(_Iterator __first, _DifferenceType __n, _Function __f, const int __device = 0) noexcept {
+_LIBCPP_HIDE_FROM_ABI _Iterator __omp_parallel_for_simd_1(
+    _Iterator __first, _DifferenceType __n, _Function __f, [[maybe_unused]] const int __device = 0) noexcept {
 #  pragma omp target teams distribute parallel for simd map(tofrom : __first[0 : __n]) device(__device)
   for (_DifferenceType __i = 0; __i < __n; ++__i)
     __f(__first[__i]);
@@ -82,7 +98,7 @@ _LIBCPP_HIDE_FROM_ABI _Iterator __parallel_for_simd_1(_Iterator __first, _Differ
 
 template <class _Index, class _DifferenceType, class _Tp>
 _LIBCPP_HIDE_FROM_ABI _Index __omp_parallel_for_simd_val_1(
-    _Index __first, _DifferenceType __n, const _Tp& __value, const int __device = 0) noexcept {
+    _Index __first, _DifferenceType __n, const _Tp& __value, [[maybe_unused]] const int __device = 0) noexcept {
 #  pragma omp target teams distribute parallel for simd map(from : __first[0 : __n]) map(always, to : __value)         \
       device(__device)
   for (_DifferenceType __i = 0; __i < __n; ++__i)
@@ -104,20 +120,24 @@ __parallel_for_simd_val_1(_Index __first, _DifferenceType __n, const _Tp& __valu
 
 template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Function>
 _LIBCPP_HIDE_FROM_ABI _Iterator1 __omp_parallel_for_simd_2(
-    _Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Function __f, const int __device = 0) noexcept {
+    _Iterator1 __first1,
+    _DifferenceType __n,
+    _Iterator2 __first2,
+    _Function __f,
+    [[maybe_unused]] const int __device = 0) noexcept {
   if ((!std::is_same<_Iterator1, _Iterator2>::value) ||
       (std::is_same<_Iterator1, _Iterator2>::value &&
        !__omp_gpu_backend::__omp_in_ptr_range(__first1, __first2, __first1 + __n))) {
 #  pragma omp target teams distribute parallel for simd map(to : __first1[0 : __n]) map(from : __first2[0 : __n])      \
       device(__device)
     for (_DifferenceType __i = 0; __i < __n; ++__i)
-      __f(__first1[__i], __first2[__i]);
+      __first2[__i] = __f(__first1[__i]);
     return __first1 + __n;
   }
 #  pragma omp target teams distribute parallel for simd map(tofrom : __first1[0 : __n], __first2[0 : __n])             \
       device(__device)
   for (_DifferenceType __i = 0; __i < __n; ++__i)
-    __f(__first1[__i], __first2[__i]);
+    __first2[__i] = __f(__first1[__i]);
 
   return __first1 + __n;
 }
@@ -146,7 +166,7 @@ _LIBCPP_HIDE_FROM_ABI _Iterator1 __omp_parallel_for_simd_3(
     _Iterator2 __first2,
     _Iterator3 __first3,
     _Function __f,
-    const int __device = 0) noexcept {
+    [[maybe_unused]] const int __device = 0) noexcept {
   // It may be that __first3 is in the interval [__first1+__n) or [__firt2+__n)
   // It is, however, undefined behavior to compare two pointers that do not
   // point to the same object or are not the same type.
@@ -165,14 +185,14 @@ _LIBCPP_HIDE_FROM_ABI _Iterator1 __omp_parallel_for_simd_3(
 #  pragma omp target teams distribute parallel for simd map(to : __first1[0 : __n], __first2[0 : __n])                 \
       map(from : __first3[0 : __n]) device(__device)
     for (_DifferenceType __i = 0; __i < __n; ++__i)
-      __f(__first1[__i], __first2[__i], __first3[__i]);
+      __first3[__i] = __f(__first1[__i], __first2[__i]);
     return __first1 + __n;
   }
   // In the general case, we have to map all data to and from the device
 #  pragma omp target teams distribute parallel for simd map(                                                           \
           tofrom : __first1[0 : __n], __first2[0 : __n], __first3[0 : __n]) device(__device)
   for (_DifferenceType __i = 0; __i < __n; ++__i)
-    __f(__first1[__i], __first2[__i], __first3[__i]);
+    __first3[__i] = __f(__first1[__i], __first2[__i]);
 
   return __first1 + __n;
 }
@@ -197,46 +217,44 @@ _LIBCPP_HIDE_FROM_ABI _Iterator1 __parallel_for_simd_3(
 
 // General case
 
-#  define __PSTL_OMP_SIMD_1_REDUCTION(omp_op, std_op)                                                                            \
+#  define __PSTL_OMP_SIMD_1_REDUCTION(omp_op, std_op)                                                                                 \
     template <class _Iterator,                                                                                                   \
               class _DifferenceType,                                                                                             \
               typename _Tp,                                                                                                      \
               typename _BinaryOperationType,                                                                                     \
-              typename _UnaryOperation,                                                                                          \
-              __enable_if_t<is_arithmetic_v<_Tp>, int> = 0 >                                                                     \
+              typename _UnaryOperation>                                                                     \
     _LIBCPP_HIDE_FROM_ABI _Tp __omp_parallel_for_simd_reduction_1(                                                               \
         _Iterator __first,                                                                                                       \
         _DifferenceType __n,                                                                                                     \
         _Tp __init,                                                                                                              \
         std_op<_BinaryOperationType> __reduce,                                                                                   \
-        _UnaryOperation __transform,                                                                                             \
-        const int __device = 0) noexcept {                                                                                       \
-_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init) map(to : __first[0 : __n]) device(__device)) \
-      for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                            \
-        __init = __reduce(__init, __transform(__first[__i]));                                                                    \
-      return __init;                                                                                                             \
+        _UnaryOperation __transform/*,                                                                                             \
+        [[maybe_unused]] const int __device = 0*/) noexcept {    \
+_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init) map(to : __first[0 : __n])) /*device(__device))*/ \
+      for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                                 \
+        __init = __reduce(__init, __transform(__first[__i]));                                                                         \
+      return __init;                                                                                                                  \
     }
 
-#  define __PSTL_OMP_SIMD_2_REDUCTION(omp_op, std_op)                                                                                                \
+#  define __PSTL_OMP_SIMD_2_REDUCTION(omp_op, std_op)                                                                                                     \
     template <class _Iterator1,                                                                                                                      \
               class _Iterator2,                                                                                                                      \
               class _DifferenceType,                                                                                                                 \
               typename _Tp,                                                                                                                          \
               typename _BinaryOperationType,                                                                                                         \
-              typename _UnaryOperation,                                                                                                              \
-              __enable_if_t<is_arithmetic_v<_Tp>, int> = 0 >                                                                                         \
+              typename _UnaryOperation >                                                                                         \
     _LIBCPP_HIDE_FROM_ABI _Tp __omp_parallel_for_simd_reduction_2(                                                                                   \
         _Iterator1 __first1,                                                                                                                         \
         _Iterator2 __first2,                                                                                                                         \
         _DifferenceType __n,                                                                                                                         \
         _Tp __init,                                                                                                                                  \
         std_op<_BinaryOperationType> __reduce,                                                                                                       \
-        _UnaryOperation __transform,                                                                                                                 \
-        const int __device = 0) noexcept {                                                                                                           \
-_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init) map(to : __first1[0 : __n], __first2[0 : __n]) device(__device)) \
-      for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                                                \
-        __init = __reduce(__init, __transform(__first1[__i], __first2[__i]));                                                                        \
-      return __init;                                                                                                                                 \
+        _UnaryOperation __transform/*,                                                                                                                 \
+        [[maybe_unused]] const int __device = 0*/) noexcept {    \
+_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init) map(to : __first1[0 : __n], __first2[0 : __n]))/* device(__device))*/ \
+      for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                                                     \
+        __init = __reduce(__init, __transform(__first1[__i], __first2[__i]));                                                                             \
+      return __init;                                                                                                                                      \
     }
 
 #  define __PSTL_OMP_SIMD_REDUCTION(omp_op, std_op)                                                                    \
@@ -276,7 +294,7 @@ _LIBCPP_HIDE_FROM_ABI _Tp __parallel_for_simd_reduction_1(
     _Tp __init,
     _BinaryOperation __reduce,
     _UnaryOperation __transform,
-    const int __device = 0) noexcept {
+    [[maybe_unused]] const int __device = 0) noexcept {
   return __omp_parallel_for_simd_reduction_1(
       __omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __init, __reduce, __transform);
 }
@@ -294,7 +312,7 @@ _LIBCPP_HIDE_FROM_ABI _Tp __parallel_for_simd_reduction_2(
     _Tp __init,
     _BinaryOperation __reduce,
     _UnaryOperation __transform,
-    const int __device = 0) noexcept {
+    [[maybe_unused]] const int __device = 0) noexcept {
   return __omp_parallel_for_simd_reduction_2(
       __omp_gpu_backend::__omp_extract_base_ptr(__first1),
       __omp_gpu_backend::__omp_extract_base_ptr(__first2),
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/stable_sort.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/stable_sort.h
new file mode 100644
index 00000000000000..1760a9fd9fc9d3
--- /dev/null
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/stable_sort.h
@@ -0,0 +1,38 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_STABLE_SORT_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_STABLE_SORT_H
+
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
+#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__algorithm/stable_sort.h>
+#include <__config>
+#include <__type_traits/is_execution_policy.h>
+#include <__utility/terminate_on_exception.h>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+template <class _ExecutionPolicy, class _RandomAccessIterator, class _Comp>
+_LIBCPP_HIDE_FROM_ABI void
+__pstl_stable_sort(__gpu_backend_tag, _RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp) {
+  // TODO: Implement GPU backend. Until then, forward to the CPU backend; the std:: qualification avoids ADL.
+  return std::__pstl_stable_sort<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __comp);
+}
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_STABLE_SORT_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
index 7fcfde44aaaa7a..10f6e5ff174d67 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
@@ -37,30 +37,13 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
     _UnaryOperation __op) {
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
-                __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) {
+                __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value &&
+                __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
     // While the CPU backend captures by reference, [&], that is not valid when
     // offloading to the GPU. Therefore we must capture by value, [=].
-    return std::__par_backend::__parallel_for_simd_2(
-        __first,
-        __last - __first,
-        __result,
-        [=](__iter_reference<_ForwardIterator> __in_value, __iter_reference<_ForwardOutIterator> __out_value) {
-          __out_value = __op(__in_value);
-        });
-  } else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
-                       __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
-                       __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) {
-    std::__terminate_on_exception([&] {
-      std::__par_backend::__parallel_for(
-          __first, __last, [__op, __first, __result](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
-            return std::__pstl_transform<__remove_parallel_policy_t<_ExecutionPolicy>>(
-                __cpu_backend_tag{}, __brick_first, __brick_last, __result + (__brick_first - __first), __op);
-          });
-    });
-    return __result + (__last - __first);
-  } else {
-    return std::transform(__first, __last, __result, __op);
+    return std::__par_backend::__parallel_for_simd_2(__first, __last - __first, __result, __op);
   }
+  return std::__pstl_transform<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __result, __op);
 }
 
 template <class _ExecutionPolicy,
@@ -79,39 +62,15 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value &&
-                __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) {
+                __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value &&
+                __libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
+                __libcpp_is_contiguous_iterator<_ForwardIterator2>::value &&
+                __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
     // While the CPU backend captures by reference, [&], that is not valid when
     // offloading to the GPU. Therefore we must capture by value, [=].
-    return std::__par_backend::__parallel_for_simd_3(
-        __first1,
-        __last1 - __first1,
-        __first2,
-        __result,
-        [=](__iter_reference<_ForwardIterator1> __in1,
-            __iter_reference<_ForwardIterator2> __in2,
-            __iter_reference<_ForwardOutIterator> __out_value) { __out_value = __op(__in1, __in2); });
-  } else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
-                       __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
-                       __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value &&
-                       __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) {
-    std::__terminate_on_exception([&] {
-      std::__par_backend::__parallel_for(
-          __first1,
-          __last1,
-          [__op, __first1, __first2, __result](_ForwardIterator1 __brick_first, _ForwardIterator1 __brick_last) {
-            return std::__pstl_transform<__remove_parallel_policy_t<_ExecutionPolicy>>(
-                __cpu_backend_tag{},
-                __brick_first,
-                __brick_last,
-                __first2 + (__brick_first - __first1),
-                __result + (__brick_first - __first1),
-                __op);
-          });
-    });
-    return __result + (__last1 - __first1);
-  } else {
-    return std::transform(__first1, __last1, __first2, __result, __op);
+    return std::__par_backend::__parallel_for_simd_3(__first1, __last1 - __first1, __first2, __result, __op);
   }
+  return std::__pstl_transform<_ExecutionPolicy>(__cpu_backend_tag{}, __first1, __last1, __first2, __result, __op);
 }
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h
index 43e5631aef04af..8590dd3d024ea6 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h
@@ -12,9 +12,11 @@
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/gpu_backends/backend.h>
 #include <__config>
+#include <__functional/operations.h>
 #include <__iterator/concepts.h>
 #include <__iterator/iterator_traits.h>
 #include <__numeric/transform_reduce.h>
+#include <__type_traits/integral_constant.h>
 #include <__type_traits/is_arithmetic.h>
 #include <__type_traits/is_execution_policy.h>
 #include <__type_traits/operation_traits.h>
@@ -28,6 +30,25 @@
 
 #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
 
+template <class _T1, class _T2, class _T3>
+struct __is_supported_reduction : std::false_type {};
+
+#  define __PSTL_IS_SUPPORTED_REDUCTION(funname)                                                                       \
+    template <class _Tp>                                                                                               \
+    struct __is_supported_reduction<std::funname<_Tp>, _Tp, _Tp> : std::true_type {};                                  \
+    template <class _Tp, class _Up>                                                                                    \
+    struct __is_supported_reduction<std::funname<>, _Tp, _Up> : std::true_type {};
+
+// __is_trivial_plus_operation already exists
+__PSTL_IS_SUPPORTED_REDUCTION(plus)
+__PSTL_IS_SUPPORTED_REDUCTION(minus)
+__PSTL_IS_SUPPORTED_REDUCTION(multiplies)
+__PSTL_IS_SUPPORTED_REDUCTION(logical_and)
+__PSTL_IS_SUPPORTED_REDUCTION(logical_or)
+__PSTL_IS_SUPPORTED_REDUCTION(bit_and)
+__PSTL_IS_SUPPORTED_REDUCTION(bit_or)
+__PSTL_IS_SUPPORTED_REDUCTION(bit_xor)
+
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 //===----------------------------------------------------------------------===//
@@ -50,49 +71,16 @@ _LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
     _BinaryOperation2 __transform) {
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
-                __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value) {
+                __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value &&
+                __libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
+                __libcpp_is_contiguous_iterator<_ForwardIterator2>::value && is_arithmetic_v<_Tp> &&
+                (__is_trivial_plus_operation<_BinaryOperation1, _Tp, _Tp>::value ||
+                 __is_supported_reduction<_BinaryOperation1, _Tp, _Tp>::value)) {
     return std::__par_backend::__parallel_for_simd_reduction_2(
-        std::move(__first1),
-        std::move(__first2),
-        __last1 - __first1,
-        std::move(__init),
-        std::move(__reduce),
-        [=](__iter_reference<_ForwardIterator1> __in_value_1, __iter_reference<_ForwardIterator1> __in_value_2) {
-          return __transform(__in_value_1, __in_value_2);
-        });
-  } else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
-                       __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
-                       __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value) {
-    return std::__terminate_on_exception([&] {
-      return __par_backend::__parallel_transform_reduce(
-          __first1,
-          std::move(__last1),
-          [__first1, __first2, __transform](_ForwardIterator1 __iter) {
-            return __transform(*__iter, *(__first2 + (__iter - __first1)));
-          },
-          std::move(__init),
-          std::move(__reduce),
-          [__first1, __first2, __reduce, __transform](
-              _ForwardIterator1 __brick_first, _ForwardIterator1 __brick_last, _Tp __brick_init) {
-            return std::__pstl_transform_reduce<__remove_parallel_policy_t<_ExecutionPolicy>>(
-                __cpu_backend_tag{},
-                __brick_first,
-                std::move(__brick_last),
-                __first2 + (__brick_first - __first1),
-                std::move(__brick_init),
-                std::move(__reduce),
-                std::move(__transform));
-          });
-    });
-  } else {
-    return std::transform_reduce(
-        std::move(__first1),
-        std::move(__last1),
-        std::move(__first2),
-        std::move(__init),
-        std::move(__reduce),
-        std::move(__transform));
+        __first1, __first2, __last1 - __first1, __init, __reduce, __transform);
   }
+  return std::__pstl_transform_reduce<_ExecutionPolicy>(
+      __cpu_backend_tag{}, __first1, __last1, __first2, std::move(__init), __reduce, __transform);
 }
 
 //===----------------------------------------------------------------------===//
@@ -108,36 +96,15 @@ _LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
     _BinaryOperation __reduce,
     _UnaryOperation __transform) {
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
-                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
+                __libcpp_is_contiguous_iterator<_ForwardIterator>::value && is_arithmetic_v<_Tp> &&
+                (__is_trivial_plus_operation<_BinaryOperation, _Tp, _Tp>::value ||
+                 __is_supported_reduction<_BinaryOperation, _Tp, _Tp>::value)) {
     return std::__par_backend::__parallel_for_simd_reduction_1(
-        std::move(__first),
-        __last - __first,
-        std::move(__init),
-        std::move(__reduce),
-        [=](__iter_reference<_ForwardIterator> __in_value) { return __transform(__in_value); });
-  } else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
-                       __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
-    return std::__terminate_on_exception([&] {
-      return __par_backend::__parallel_transform_reduce(
-          std::move(__first),
-          std::move(__last),
-          [__transform](_ForwardIterator __iter) { return __transform(*__iter); },
-          std::move(__init),
-          __reduce,
-          [__transform, __reduce](auto __brick_first, auto __brick_last, _Tp __brick_init) {
-            return std::__pstl_transform_reduce<__remove_parallel_policy_t<_ExecutionPolicy>>(
-                __cpu_backend_tag{},
-                std::move(__brick_first),
-                std::move(__brick_last),
-                std::move(__brick_init),
-                std::move(__reduce),
-                std::move(__transform));
-          });
-    });
-  } else {
-    return std::transform_reduce(
-        std::move(__first), std::move(__last), std::move(__init), std::move(__reduce), std::move(__transform));
+        __first, __last - __first, __init, __reduce, __transform);
   }
+  return std::__pstl_transform_reduce<_ExecutionPolicy>(
+      __cpu_backend_tag{}, __first, __last, std::move(__init), __reduce, __transform);
 }
 
 _LIBCPP_END_NAMESPACE_STD

>From c60842303c27cc3766210d45519ab8404163ade5 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Tue, 26 Sep 2023 20:15:33 -0700
Subject: [PATCH 08/65] Missing return statements in fill and for_each

---
 libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h    | 2 +-
 .../include/__algorithm/pstl_backends/gpu_backends/for_each.h   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
index 8dc6bc6a6179c0..d109495009df89 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
@@ -36,7 +36,7 @@ __pstl_fill(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
-    std::__par_backend::__parallel_for_simd_val_1(__first, __last - __first, __value);
+    return std::__par_backend::__parallel_for_simd_val_1(__first, __last - __first, __value);
   }
   // Otherwise, we execute for_each on the CPU instead
   return std::__pstl_fill<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __value);
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
index 23c8da27e64ae3..bab0c87de8f2fc 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
@@ -35,7 +35,7 @@ __pstl_for_each(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
-    std::__par_backend::__parallel_for_simd_1(__first, __last - __first, __func);
+    return std::__par_backend::__parallel_for_simd_1(__first, __last - __first, __func);
   }
   // Else if the excution policy is parallel, we execute for_each on the CPU instead
   return std::__pstl_for_each<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __func);

>From 972fb9c55aaba949389cf3f241a2e83b6a75010d Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Wed, 27 Sep 2023 10:31:16 -0700
Subject: [PATCH 09/65] Passing all LIT tests

---
 .../__algorithm/pstl_backends/gpu_backends/fill.h | 10 ++++++----
 .../pstl_backends/gpu_backends/for_each.h         |  8 +++++---
 .../pstl_backends/gpu_backends/stable_sort.h      |  2 +-
 .../pstl_backends/gpu_backends/transform.h        | 15 ++++++++-------
 .../pstl_backends/gpu_backends/transform_reduce.h |  6 +++---
 5 files changed, 23 insertions(+), 18 deletions(-)

diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
index d109495009df89..f32ee8b016b3ea 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
@@ -30,16 +30,18 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 template <class _ExecutionPolicy, class _ForwardIterator, class _Tp>
 _LIBCPP_HIDE_FROM_ABI void
 __pstl_fill(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) {
-  // It is only safe to execute for_each on the GPU, it the execution policy is
+  // It is only safe to execute fill on the GPU, if the execution policy is
   // parallel unsequenced, as it is the only execution policy prohibiting throwing
   // exceptions and allowing SIMD instructions
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
-    return std::__par_backend::__parallel_for_simd_val_1(__first, __last - __first, __value);
+    std::__par_backend::__parallel_for_simd_val_1(__first, __last - __first, __value);
+  }
+  // Otherwise, we execute fill on the CPU instead
+  else {
+    std::__pstl_fill<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __value);
   }
-  // Otherwise, we execute for_each on the CPU instead
-  return std::__pstl_fill<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __value);
 }
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
index bab0c87de8f2fc..f96b30b5ba25b2 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
@@ -35,10 +35,12 @@ __pstl_for_each(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
-    return std::__par_backend::__parallel_for_simd_1(__first, __last - __first, __func);
+    std::__par_backend::__parallel_for_simd_1(__first, __last - __first, __func);
+  }
+  // Else we fall back to the CPU backend
+  else {
+    std::__pstl_for_each<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __func);
   }
-  // Else if the excution policy is parallel, we execute for_each on the CPU instead
-  return std::__pstl_for_each<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __func);
 }
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/stable_sort.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/stable_sort.h
index 1760a9fd9fc9d3..5cd7081ef73e9c 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/stable_sort.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/stable_sort.h
@@ -28,7 +28,7 @@ template <class _ExecutionPolicy, class _RandomAccessIterator, class _Comp>
 _LIBCPP_HIDE_FROM_ABI void
 __pstl_stable_sort(__gpu_backend_tag, _RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp) {
   // TODO: Implement GPU backend.
-  return __pstl_stable_sort<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __comp);
+  __pstl_stable_sort<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __comp);
 }
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
index 10f6e5ff174d67..c2e43cb6d64337 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
@@ -38,11 +38,12 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
                 __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value &&
-                __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
-    // While the CPU backend captures by reference, [&], that is not valid when
-    // offloading to the GPU. Therefore we must capture by value, [=].
-    return std::__par_backend::__parallel_for_simd_2(__first, __last - __first, __result, __op);
+                __libcpp_is_contiguous_iterator<_ForwardIterator>::value &&
+                __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
+    std::__par_backend::__parallel_for_simd_2(__first, __last - __first, __result, __op);
+    return __result + (__last - __first);
   }
+  // If it is not safe to offload to the GPU, we rely on the CPU backend.
   return std::__pstl_transform<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __result, __op);
 }
 
@@ -66,10 +67,10 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
                 __libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator2>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
-    // While the CPU backend captures by reference, [&], that is not valid when
-    // offloading to the GPU. Therefore we must capture by value, [=].
-    return std::__par_backend::__parallel_for_simd_3(__first1, __last1 - __first1, __first2, __result, __op);
+    std::__par_backend::__parallel_for_simd_3(__first1, __last1 - __first1, __first2, __result, __op);
+    return __result + (__last1 - __first1);
   }
+  // If it is not safe to offload to the GPU, we rely on the CPU backend.
   return std::__pstl_transform<_ExecutionPolicy>(__cpu_backend_tag{}, __first1, __last1, __first2, __result, __op);
 }
 
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h
index 8590dd3d024ea6..332bb8abc1b8e0 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h
@@ -31,13 +31,13 @@
 #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
 
 template <class _T1, class _T2, class _T3>
-struct __is_supported_reduction : std::false_type {};
+_LIBCPP_HIDE_FROM_ABI struct __is_supported_reduction : std::false_type {};
 
 #  define __PSTL_IS_SUPPORTED_REDUCTION(funname)                                                                       \
     template <class _Tp>                                                                                               \
-    struct __is_supported_reduction<std::funname<_Tp>, _Tp, _Tp> : std::true_type {};                                  \
+    _LIBCPP_HIDE_FROM_ABI struct __is_supported_reduction<std::funname<_Tp>, _Tp, _Tp> : std::true_type {};            \
     template <class _Tp, class _Up>                                                                                    \
-    struct __is_supported_reduction<std::funname<>, _Tp, _Up> : std::true_type {};
+    _LIBCPP_HIDE_FROM_ABI struct __is_supported_reduction<std::funname<>, _Tp, _Up> : std::true_type {};
 
 // __is_trivial_plus_operation already exists
 __PSTL_IS_SUPPORTED_REDUCTION(plus)

>From 7bad6f4020fc06c4d55a158bb20ad9d2eb607258 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Tue, 3 Oct 2023 20:58:11 -0700
Subject: [PATCH 10/65] Handling value iterators in PSTL GPU backend for C++ 20

---
 .../pstl_backends/gpu_backends/omp_offload.h  | 219 ++++++++++--------
 .../pstl_backends/gpu_backends/transform.h    |   6 +
 .../gpu_backends/transform_reduce.h           |  24 +-
 3 files changed, 138 insertions(+), 111 deletions(-)

diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
index 36acafd448ec00..81ec1bbc63d008 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
@@ -12,6 +12,7 @@
 #include <__assert>
 #include <__config>
 #include <__functional/operations.h>
+#include <__iterator/iterator_traits.h>
 #include <__iterator/wrap_iter.h>
 #include <__memory/addressof.h>
 #include <__memory/pointer_traits.h>
@@ -38,36 +39,66 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 namespace __par_backend {
 inline namespace __omp_gpu_backend {
 
-// Checking if a pointer is in a range
-template <typename T1, typename T2, typename T3>
-_LIBCPP_HIDE_FROM_ABI inline bool __omp_in_ptr_range(T1, T2, T3) {
-  return false;
+// Functions for extracting the base pointers
+
+// In the general case we do not need to extract it. This is for instance the
+// case for pointers.
+template <typename _Tp>
+_LIBCPP_HIDE_FROM_ABI inline auto __omp_extract_base_ptr(_Tp p) noexcept {
+  return p;
 }
 
+// For vectors and arrays, etc, we need to extract the underlying base pointer.
 template <typename _Tp>
-_LIBCPP_HIDE_FROM_ABI inline bool __omp_in_ptr_range(_Tp* a, _Tp* p, _Tp* b) {
-  return std::less_equal<_Tp*>{}(a, p) && std::less<_Tp*>{}(p, b);
+_LIBCPP_HIDE_FROM_ABI inline _Tp __omp_extract_base_ptr(std::__wrap_iter<_Tp> w) noexcept {
+  std::pointer_traits<std::__wrap_iter<_Tp>> PT;
+  return PT.to_address(w);
 }
 
-// In OpenMP, we need to extract the pointer for the underlying data for data
-// structures like std::vector and std::array to be able to map the data to the
-// device.
+//===----------------------------------------------------------------------===//
+// The following four functions differentiate between contiguous iterators and
+// non-contiguous iterators. That allows using the same implementations for
+// reference and value iterators
+//===----------------------------------------------------------------------===//
 
-template <typename _Tp, std::enable_if<std::is_pointer<_Tp>::value >::type* = 0>
-_LIBCPP_HIDE_FROM_ABI inline _Tp __omp_extract_base_ptr(_Tp p) {
-  return p;
+template <class _Iterator, class _DifferenceType>
+_LIBCPP_HIDE_FROM_ABI void
+__omp_map_to([[maybe_unused]] const _Iterator p, [[maybe_unused]] const _DifferenceType len) noexcept {
+  if constexpr (__libcpp_is_contiguous_iterator<_Iterator>::value) {
+#  pragma omp target enter data map(to : p[0 : len])
+  } else {
+#  pragma omp target enter data map(to : p)
+  }
 }
 
-template <typename _Tp>
-_LIBCPP_HIDE_FROM_ABI inline auto __omp_extract_base_ptr(_Tp p) {
-  return std::addressof(*p);
-  ;
+template <class _Iterator, class _DifferenceType>
+_LIBCPP_HIDE_FROM_ABI void
+__omp_map_from([[maybe_unused]] const _Iterator p, [[maybe_unused]] const _DifferenceType len) noexcept {
+  if constexpr (__libcpp_is_contiguous_iterator<_Iterator>::value) {
+#  pragma omp target exit data map(from : p[0 : len])
+  } else {
+#  pragma omp target exit data map(release : p)
+  }
 }
 
-template <typename _Tp>
-_LIBCPP_HIDE_FROM_ABI inline _Tp __omp_extract_base_ptr(std::__wrap_iter<_Tp> w) {
-  std::pointer_traits<std::__wrap_iter<_Tp>> PT;
-  return PT.to_address(w);
+template <class _Iterator, class _DifferenceType>
+_LIBCPP_HIDE_FROM_ABI void
+__omp_map_alloc([[maybe_unused]] const _Iterator p, [[maybe_unused]] const _DifferenceType len) noexcept {
+  if constexpr (__libcpp_is_contiguous_iterator<_Iterator>::value) {
+#  pragma omp target enter data map(alloc : p[0 : len])
+  } else {
+#  pragma omp target enter data map(to : p)
+  }
+}
+
+template <class _Iterator, class _DifferenceType>
+_LIBCPP_HIDE_FROM_ABI void
+__omp_map_free([[maybe_unused]] const _Iterator p, [[maybe_unused]] const _DifferenceType len) noexcept {
+  if constexpr (__libcpp_is_contiguous_iterator<_Iterator>::value) {
+#  pragma omp target exit data map(release : p[0 : len])
+  } else {
+#  pragma omp target exit data map(release : p)
+  }
 }
 
 //===----------------------------------------------------------------------===//
@@ -79,9 +110,11 @@ _LIBCPP_HIDE_FROM_ABI inline _Tp __omp_extract_base_ptr(std::__wrap_iter<_Tp> w)
 template <class _Iterator, class _DifferenceType, class _Function>
 _LIBCPP_HIDE_FROM_ABI _Iterator __omp_parallel_for_simd_1(
     _Iterator __first, _DifferenceType __n, _Function __f, [[maybe_unused]] const int __device = 0) noexcept {
-#  pragma omp target teams distribute parallel for simd map(tofrom : __first[0 : __n]) device(__device)
+  __omp_map_to(__first, __n);
+#  pragma omp target teams distribute parallel for simd device(__device)
   for (_DifferenceType __i = 0; __i < __n; ++__i)
     __f(__first[__i]);
+  __omp_map_from(__first, __n);
 
   return __first + __n;
 }
@@ -99,11 +132,11 @@ _LIBCPP_HIDE_FROM_ABI _Iterator __parallel_for_simd_1(_Iterator __first, _Differ
 template <class _Index, class _DifferenceType, class _Tp>
 _LIBCPP_HIDE_FROM_ABI _Index __omp_parallel_for_simd_val_1(
     _Index __first, _DifferenceType __n, const _Tp& __value, [[maybe_unused]] const int __device = 0) noexcept {
-#  pragma omp target teams distribute parallel for simd map(from : __first[0 : __n]) map(always, to : __value)         \
-      device(__device)
+  __omp_map_alloc(__first, __n);
+#  pragma omp target teams distribute parallel for simd firstprivate(__value) device(__device)
   for (_DifferenceType __i = 0; __i < __n; ++__i)
     __first[__i] = __value;
-
+  __omp_map_from(__first, __n);
   return __first + __n;
 }
 
@@ -125,20 +158,13 @@ _LIBCPP_HIDE_FROM_ABI _Iterator1 __omp_parallel_for_simd_2(
     _Iterator2 __first2,
     _Function __f,
     [[maybe_unused]] const int __device = 0) noexcept {
-  if ((!std::is_same<_Iterator1, _Iterator2>::value) ||
-      (std::is_same<_Iterator1, _Iterator2>::value &&
-       !__omp_gpu_backend::__omp_in_ptr_range(__first1, __first2, __first1 + __n))) {
-#  pragma omp target teams distribute parallel for simd map(to : __first1[0 : __n]) map(from : __first2[0 : __n])      \
-      device(__device)
-    for (_DifferenceType __i = 0; __i < __n; ++__i)
-      __first2[__i] = __f(__first1[__i]);
-    return __first1 + __n;
-  }
-#  pragma omp target teams distribute parallel for simd map(tofrom : __first1[0 : __n], __first2[0 : __n])             \
-      device(__device)
+  __omp_map_alloc(__first2, __n);
+  __omp_map_to(__first1, __n);
+#  pragma omp target teams distribute parallel for simd device(__device)
   for (_DifferenceType __i = 0; __i < __n; ++__i)
-    __first2[__i] = __f(__first1[__i]);
-
+    *(__first2 + __i) = __f(*(__first1 + __i));
+  __omp_map_from(__first2, __n);
+  __omp_map_free(__first1, __n);
   return __first1 + __n;
 }
 
@@ -167,33 +193,15 @@ _LIBCPP_HIDE_FROM_ABI _Iterator1 __omp_parallel_for_simd_3(
     _Iterator3 __first3,
     _Function __f,
     [[maybe_unused]] const int __device = 0) noexcept {
-  // It may be that __first3 is in the interval [__first1+__n) or [__firt2+__n)
-  // It is, however, undefined behavior to compare two pointers that do not
-  // point to the same object or are not the same type.
-  // If we can prove that __first3 is not in any of the ranges [__first1+__n)
-  // or [__firt2+__n), it is safe to reduce the amount of data copied to and
-  // from the device
-  constexpr bool are_not_same_type =
-      !std::is_same<_Iterator1, _Iterator2>::value && !std::is_same<_Iterator1, _Iterator3>::value;
-  const bool no_overlap_13 =
-      std::is_same<_Iterator1, _Iterator3>::value &&
-      !__omp_gpu_backend::__omp_in_ptr_range(__first1, __first3, __first1 + __n);
-  const bool no_overlap_23 =
-      std::is_same<_Iterator2, _Iterator3>::value &&
-      !__omp_gpu_backend::__omp_in_ptr_range(__first2, __first3, __first2 + __n);
-  if (are_not_same_type || (no_overlap_13 && no_overlap_23)) {
-#  pragma omp target teams distribute parallel for simd map(to : __first1[0 : __n], __first2[0 : __n])                 \
-      map(from : __first3[0 : __n]) device(__device)
-    for (_DifferenceType __i = 0; __i < __n; ++__i)
-      __first3[__i] = __f(__first1[__i], __first2[__i]);
-    return __first1 + __n;
-  }
-  // In the general case, we have to map all data to and from the device
-#  pragma omp target teams distribute parallel for simd map(                                                           \
-          tofrom : __first1[0 : __n], __first2[0 : __n], __first3[0 : __n]) device(__device)
+  __omp_map_to(__first1, __n);
+  __omp_map_to(__first2, __n);
+  __omp_map_alloc(__first3, __n);
+#  pragma omp target teams distribute parallel for simd device(__device)
   for (_DifferenceType __i = 0; __i < __n; ++__i)
-    __first3[__i] = __f(__first1[__i], __first2[__i]);
-
+    *(__first3 + __i) = __f(*(__first1 + __i), *(__first2 + __i));
+  __omp_map_free(__first1, __n);
+  __omp_map_free(__first2, __n);
+  __omp_map_from(__first3, __n);
   return __first1 + __n;
 }
 
@@ -215,47 +223,54 @@ _LIBCPP_HIDE_FROM_ABI _Iterator1 __parallel_for_simd_3(
 // Templates for reductions
 //===----------------------------------------------------------------------===//
 
-// General case
-
-#  define __PSTL_OMP_SIMD_1_REDUCTION(omp_op, std_op)                                                                                 \
-    template <class _Iterator,                                                                                                   \
-              class _DifferenceType,                                                                                             \
-              typename _Tp,                                                                                                      \
-              typename _BinaryOperationType,                                                                                     \
-              typename _UnaryOperation>                                                                     \
-    _LIBCPP_HIDE_FROM_ABI _Tp __omp_parallel_for_simd_reduction_1(                                                               \
-        _Iterator __first,                                                                                                       \
-        _DifferenceType __n,                                                                                                     \
-        _Tp __init,                                                                                                              \
-        std_op<_BinaryOperationType> __reduce,                                                                                   \
-        _UnaryOperation __transform/*,                                                                                             \
-        [[maybe_unused]] const int __device = 0*/) noexcept {    \
-_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init) map(to : __first[0 : __n])) /*device(__device))*/ \
-      for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                                 \
-        __init = __reduce(__init, __transform(__first[__i]));                                                                         \
-      return __init;                                                                                                                  \
+// In the two following function templates, we map the pointers to the device in
+// different ways depending on whether they are contiguous or not.
+
+#  define __PSTL_OMP_SIMD_1_REDUCTION(omp_op, std_op)                                                                  \
+    template <class _Iterator,                                                                                         \
+              class _DifferenceType,                                                                                   \
+              typename _Tp,                                                                                            \
+              typename _BinaryOperationType,                                                                           \
+              typename _UnaryOperation>                                                                                \
+    _LIBCPP_HIDE_FROM_ABI _Tp __omp_parallel_for_simd_reduction_1(                                                     \
+        _Iterator __first,                                                                                             \
+        _DifferenceType __n,                                                                                           \
+        _Tp __init,                                                                                                    \
+        std_op<_BinaryOperationType> __reduce,                                                                         \
+        _UnaryOperation __transform,                                                                                   \
+        [[maybe_unused]] const int __device = 0) noexcept {                                                            \
+      __omp_map_to(__first, __n);                                                                                      \
+_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init) device(__device))                  \
+      for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                  \
+        __init = __reduce(__init, __transform(*(__first + __i)));                                                      \
+      __omp_map_free(__first, __n);                                                                                    \
+      return __init;                                                                                                   \
     }
 
-#  define __PSTL_OMP_SIMD_2_REDUCTION(omp_op, std_op)                                                                                                     \
-    template <class _Iterator1,                                                                                                                      \
-              class _Iterator2,                                                                                                                      \
-              class _DifferenceType,                                                                                                                 \
-              typename _Tp,                                                                                                                          \
-              typename _BinaryOperationType,                                                                                                         \
-              typename _UnaryOperation >                                                                                         \
-    _LIBCPP_HIDE_FROM_ABI _Tp __omp_parallel_for_simd_reduction_2(                                                                                   \
-        _Iterator1 __first1,                                                                                                                         \
-        _Iterator2 __first2,                                                                                                                         \
-        _DifferenceType __n,                                                                                                                         \
-        _Tp __init,                                                                                                                                  \
-        std_op<_BinaryOperationType> __reduce,                                                                                                       \
-        _UnaryOperation __transform/*,                                                                                                                 \
-        [[maybe_unused]] const int __device = 0*/) noexcept {    \
-_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init) map(to : __first1[0 : __n], __first2[0 : __n]))/* device(__device))*/ \
-      for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                                                     \
-        __init = __reduce(__init, __transform(__first1[__i], __first2[__i]));                                                                             \
-      return __init;                                                                                                                                      \
-    }
+#  define __PSTL_OMP_SIMD_2_REDUCTION(omp_op, std_op)                                                                  \
+    template <class _Iterator1,                                                                                        \
+              class _Iterator2,                                                                                        \
+              class _DifferenceType,                                                                                   \
+              typename _Tp,                                                                                            \
+              typename _BinaryOperationType,                                                                           \
+              typename _UnaryOperation >                                                                               \
+    _LIBCPP_HIDE_FROM_ABI _Tp __omp_parallel_for_simd_reduction_2(                                                     \
+        _Iterator1 __first1,                                                                                           \
+        _Iterator2 __first2,                                                                                           \
+        _DifferenceType __n,                                                                                           \
+        _Tp __init,                                                                                                    \
+        std_op<_BinaryOperationType> __reduce,                                                                         \
+        _UnaryOperation __transform,                                                                                   \
+        [[maybe_unused]] const int __device = 0) noexcept {                                                            \
+      __omp_map_to(__first1, __n);                                                                                     \
+      __omp_map_to(__first2, __n);                                                                                     \
+_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init) device(__device))                  \
+      for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                  \
+        __init = __reduce(__init, __transform(*(__first1 + __i), *(__first2 + __i)));                                  \
+      __omp_map_free(__first1, __n);                                                                                   \
+      __omp_map_free(__first2, __n);                                                                                   \
+      return __init;                                                                                                   \
+    } // end of __omp_parallel_for_simd_reduction_2
 
 #  define __PSTL_OMP_SIMD_REDUCTION(omp_op, std_op)                                                                    \
     __PSTL_OMP_SIMD_1_REDUCTION(omp_op, std_op)                                                                        \
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
index c2e43cb6d64337..3af2106eb2bda2 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
@@ -35,10 +35,13 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
     _ForwardIterator __last,
     _ForwardOutIterator __result,
     _UnaryOperation __op) {
+  // The interface for the function switched between C++17 and C++20
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
                 __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value &&
+#  if _LIBCPP_STD_VER <= 17
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value &&
+#  endif
                 __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
     std::__par_backend::__parallel_for_simd_2(__first, __last - __first, __result, __op);
     return __result + (__last - __first);
@@ -60,12 +63,15 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
     _ForwardIterator2 __first2,
     _ForwardOutIterator __result,
     _BinaryOperation __op) {
+  // The interface for the function switched between C++17 and C++20
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value &&
                 __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value &&
+#  if _LIBCPP_STD_VER <= 17
                 __libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator2>::value &&
+#  endif
                 __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
     std::__par_backend::__parallel_for_simd_3(__first1, __last1 - __first1, __first2, __result, __op);
     return __result + (__last1 - __first1);
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h
index 332bb8abc1b8e0..eeffe62c040f08 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h
@@ -30,14 +30,16 @@
 
 #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
 
+_LIBCPP_BEGIN_NAMESPACE_STD
+
 template <class _T1, class _T2, class _T3>
-_LIBCPP_HIDE_FROM_ABI struct __is_supported_reduction : std::false_type {};
+struct _LIBCPP_HIDE_FROM_ABI __is_supported_reduction : std::false_type {};
 
 #  define __PSTL_IS_SUPPORTED_REDUCTION(funname)                                                                       \
     template <class _Tp>                                                                                               \
-    _LIBCPP_HIDE_FROM_ABI struct __is_supported_reduction<std::funname<_Tp>, _Tp, _Tp> : std::true_type {};            \
+    struct _LIBCPP_HIDE_FROM_ABI __is_supported_reduction<std::funname<_Tp>, _Tp, _Tp> : std::true_type {};            \
     template <class _Tp, class _Up>                                                                                    \
-    _LIBCPP_HIDE_FROM_ABI struct __is_supported_reduction<std::funname<>, _Tp, _Up> : std::true_type {};
+    struct _LIBCPP_HIDE_FROM_ABI __is_supported_reduction<std::funname<>, _Tp, _Up> : std::true_type {};
 
 // __is_trivial_plus_operation already exists
 __PSTL_IS_SUPPORTED_REDUCTION(plus)
@@ -49,8 +51,6 @@ __PSTL_IS_SUPPORTED_REDUCTION(bit_and)
 __PSTL_IS_SUPPORTED_REDUCTION(bit_or)
 __PSTL_IS_SUPPORTED_REDUCTION(bit_xor)
 
-_LIBCPP_BEGIN_NAMESPACE_STD
-
 //===----------------------------------------------------------------------===//
 // Two input iterators
 //===----------------------------------------------------------------------===//
@@ -69,11 +69,14 @@ _LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
     _Tp __init,
     _BinaryOperation1 __reduce,
     _BinaryOperation2 __transform) {
+  // The interface for the function switched between C++17 and C++20
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
-                __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value &&
+                __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value && is_arithmetic_v<_Tp> &&
+#  if _LIBCPP_STD_VER <= 17
                 __libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
-                __libcpp_is_contiguous_iterator<_ForwardIterator2>::value && is_arithmetic_v<_Tp> &&
+                __libcpp_is_contiguous_iterator<_ForwardIterator2>::value &&
+#  endif
                 (__is_trivial_plus_operation<_BinaryOperation1, _Tp, _Tp>::value ||
                  __is_supported_reduction<_BinaryOperation1, _Tp, _Tp>::value)) {
     return std::__par_backend::__parallel_for_simd_reduction_2(
@@ -95,9 +98,12 @@ _LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
     _Tp __init,
     _BinaryOperation __reduce,
     _UnaryOperation __transform) {
+  // The interface for the function switched between C++17 and C++20
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
-                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
-                __libcpp_is_contiguous_iterator<_ForwardIterator>::value && is_arithmetic_v<_Tp> &&
+                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value && is_arithmetic_v<_Tp> &&
+#  if _LIBCPP_STD_VER <= 17
+                __libcpp_is_contiguous_iterator<_ForwardIterator>::value &&
+#  endif
                 (__is_trivial_plus_operation<_BinaryOperation, _Tp, _Tp>::value ||
                  __is_supported_reduction<_BinaryOperation, _Tp, _Tp>::value)) {
     return std::__par_backend::__parallel_for_simd_reduction_1(

>From 865b1eb1b5c42df701525643ca79ef3339d97da6 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Fri, 6 Oct 2023 11:42:25 -0700
Subject: [PATCH 11/65] Restructuring pstl_backends/gpu/ to
 pstl_backends/openmp

---
 libcxx/CMakeLists.txt                         | 36 ++++++++-----------
 libcxx/include/CMakeLists.txt                 | 22 ++++++------
 libcxx/include/__algorithm/pstl_backend.h     | 34 +++++++++++++-----
 .../__algorithm/pstl_backends/cpu_backend.h   | 23 +++++++-----
 .../pstl_backends/cpu_backends/backend.h      |  2 ++
 .../__algorithm/pstl_backends/gpu_backend.h   | 27 --------------
 .../__algorithm/pstl_backends/openmp.h        | 27 ++++++++++++++
 .../{gpu_backends => openmp}/any_of.h         | 13 ++++---
 .../{gpu_backends => openmp}/backend.h        | 16 +++------
 .../{gpu_backends => openmp}/fill.h           | 13 ++++---
 .../{gpu_backends => openmp}/find_if.h        | 13 ++++---
 .../{gpu_backends => openmp}/for_each.h       | 19 +++++-----
 .../{gpu_backends => openmp}/merge.h          | 14 ++++----
 .../{gpu_backends => openmp}/omp_offload.h    |  6 ++--
 .../{gpu_backends => openmp}/stable_sort.h    | 13 ++++---
 .../{gpu_backends => openmp}/transform.h      | 25 ++++++-------
 .../transform_reduce.h                        | 23 +++++-------
 libcxx/include/__config_site.in               |  3 +-
 18 files changed, 161 insertions(+), 168 deletions(-)
 delete mode 100644 libcxx/include/__algorithm/pstl_backends/gpu_backend.h
 create mode 100644 libcxx/include/__algorithm/pstl_backends/openmp.h
 rename libcxx/include/__algorithm/pstl_backends/{gpu_backends => openmp}/any_of.h (70%)
 rename libcxx/include/__algorithm/pstl_backends/{gpu_backends => openmp}/backend.h (58%)
 rename libcxx/include/__algorithm/pstl_backends/{gpu_backends => openmp}/fill.h (78%)
 rename libcxx/include/__algorithm/pstl_backends/{gpu_backends => openmp}/find_if.h (71%)
 rename libcxx/include/__algorithm/pstl_backends/{gpu_backends => openmp}/for_each.h (74%)
 rename libcxx/include/__algorithm/pstl_backends/{gpu_backends => openmp}/merge.h (73%)
 rename libcxx/include/__algorithm/pstl_backends/{gpu_backends => openmp}/omp_offload.h (98%)
 rename libcxx/include/__algorithm/pstl_backends/{gpu_backends => openmp}/stable_sort.h (68%)
 rename libcxx/include/__algorithm/pstl_backends/{gpu_backends => openmp}/transform.h (76%)
 rename libcxx/include/__algorithm/pstl_backends/{gpu_backends => openmp}/transform_reduce.h (85%)

diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt
index 3c27c1ea4861a9..e8234d47d024bd 100644
--- a/libcxx/CMakeLists.txt
+++ b/libcxx/CMakeLists.txt
@@ -294,19 +294,14 @@ option(LIBCXX_HAS_WIN32_THREAD_API "Ignore auto-detection and force use of win32
 option(LIBCXX_HAS_EXTERNAL_THREAD_API
   "Build libc++ with an externalized threading API.
    This option may only be set to ON when LIBCXX_ENABLE_THREADS=ON." OFF)
-option(LIBCXX_ENABLE_GPU_OFFLOAD 
-  "Build libc++ with support for GPU offload" OFF)
 
-if (LIBCXX_ENABLE_THREADS)
-  set(LIBCXX_PSTL_CPU_BACKEND "std_thread" CACHE STRING "Which PSTL CPU backend to use")
-else()
-  set(LIBCXX_PSTL_CPU_BACKEND "serial" CACHE STRING "Which PSTL CPU backend to use")
-endif()
-
-if (${LIBCXX_ENABLE_GPU_OFFLOAD})
-  set(LIBCXX_PSTL_GPU_BACKEND "omp_offload" CACHE STRING "Which PSTL GPU backend to use")
-else()
-  set(LIBCXX_PSTL_GPU_BACKEND "none" CACHE STRING "Which PSTL GPU backend to use")
+set(LIBCXX_PSTL_BACKEND "openmp" CACHE INTERNAL "Which PSTL backend to use")
+if (LIBCXX_PSTL_BACKEND STREQUAL "")
+  if (LIBCXX_ENABLE_THREADS)
+    set(LIBCXX_PSTL_BACKEND "std-thread")
+  else()
+    set(LIBCXX_PSTL_BACKEND "serial")
+  endif()
 endif()
 
 # Misc options ----------------------------------------------------------------
@@ -779,20 +774,17 @@ elseif (LIBCXX_HARDENING_MODE STREQUAL "unchecked")
   config_define(0 _LIBCPP_ENABLE_DEBUG_MODE_DEFAULT)
 endif()
 
-if (LIBCXX_PSTL_CPU_BACKEND STREQUAL "serial")
+if (LIBCXX_PSTL_BACKEND STREQUAL "serial")
   config_define(1 _LIBCPP_PSTL_CPU_BACKEND_SERIAL)
-elseif(LIBCXX_PSTL_CPU_BACKEND STREQUAL "std_thread")
+elseif (LIBCXX_PSTL_BACKEND STREQUAL "std-thread")
   config_define(1 _LIBCPP_PSTL_CPU_BACKEND_THREAD)
-elseif(LIBCXX_PSTL_CPU_BACKEND STREQUAL "libdispatch")
+elseif (LIBCXX_PSTL_BACKEND STREQUAL "libdispatch")
   config_define(1 _LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH)
+elseif (LIBCXX_PSTL_BACKEND STREQUAL "openmp")
+  config_define(1 _LIBCPP_PSTL_BACKEND_OPENMP)
 else()
-  message(FATAL_ERROR "LIBCXX_PSTL_CPU_BACKEND is set to ${LIBCXX_PSTL_CPU_BACKEND}, which is not a valid backend.
-                       Valid backends are: serial, std_thread and libdispatch")
-endif()
-
-config_define_if(LIBCXX_ENABLE_GPU_OFFLOAD _LIBCPP_PSTL_GPU_OFFLOAD)
-if (LIBCXX_PSTL_GPU_BACKEND STREQUAL "omp_offload")
-  config_define(1 _LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
+  message(FATAL_ERROR "LIBCXX_PSTL_BACKEND is set to ${LIBCXX_PSTL_BACKEND}, which is not a valid backend.
+                       Valid backends are: serial, std-thread, libdispatch, and openmp.")
 endif()
 
 if (LIBCXX_ABI_DEFINES)
diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index e74985feff09e3..7cc9d525ec05e6 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -85,17 +85,17 @@ set(files
   __algorithm/pstl_backends/cpu_backends/thread.h
   __algorithm/pstl_backends/cpu_backends/transform.h
   __algorithm/pstl_backends/cpu_backends/transform_reduce.h
-  __algorithm/pstl_backends/gpu_backend.h
-  __algorithm/pstl_backends/gpu_backends/any_of.h
-  __algorithm/pstl_backends/gpu_backends/backend.h
-  __algorithm/pstl_backends/gpu_backends/fill.h
-  __algorithm/pstl_backends/gpu_backends/find_if.h
-  __algorithm/pstl_backends/gpu_backends/for_each.h
-  __algorithm/pstl_backends/gpu_backends/merge.h
-  __algorithm/pstl_backends/gpu_backends/omp_offload.h
-  __algorithm/pstl_backends/gpu_backends/stable_sort.h
-  __algorithm/pstl_backends/gpu_backends/transform.h
-  __algorithm/pstl_backends/gpu_backends/transform_reduce.h
+  __algorithm/pstl_backends/openmp.h
+  __algorithm/pstl_backends/openmp/any_of.h
+  __algorithm/pstl_backends/openmp/backend.h
+  __algorithm/pstl_backends/openmp/fill.h
+  __algorithm/pstl_backends/openmp/find_if.h
+  __algorithm/pstl_backends/openmp/for_each.h
+  __algorithm/pstl_backends/openmp/merge.h
+  __algorithm/pstl_backends/openmp/omp_offload.h
+  __algorithm/pstl_backends/openmp/stable_sort.h
+  __algorithm/pstl_backends/openmp/transform.h
+  __algorithm/pstl_backends/openmp/transform_reduce.h
   __algorithm/pstl_copy.h
   __algorithm/pstl_count.h
   __algorithm/pstl_fill.h
diff --git a/libcxx/include/__algorithm/pstl_backend.h b/libcxx/include/__algorithm/pstl_backend.h
index e7e7244d5e9666..b3f202765cd41b 100644
--- a/libcxx/include/__algorithm/pstl_backend.h
+++ b/libcxx/include/__algorithm/pstl_backend.h
@@ -10,7 +10,7 @@
 #define _LIBCPP___ALGORITHM_PSTL_BACKEND_H
 
 #include <__algorithm/pstl_backends/cpu_backend.h>
-#include <__algorithm/pstl_backends/gpu_backend.h>
+#include <__algorithm/pstl_backends/openmp.h>
 #include <__config>
 #include <execution>
 
@@ -192,6 +192,9 @@ frontend will turn that into a call to `std::__throw_bad_alloc();` to report the
 template <class _ExecutionPolicy>
 struct __select_backend;
 
+#  if defined(_LIBCPP_PSTL_CPU_BACKEND_SERIAL) || defined(_LIBCPP_PSTL_CPU_BACKEND_THREAD) ||                          \
+      defined(_LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH)
+
 template <>
 struct __select_backend<std::execution::sequenced_policy> {
   using type = __cpu_backend_tag;
@@ -204,25 +207,40 @@ struct __select_backend<std::execution::unsequenced_policy> {
 };
 #  endif
 
-#  if defined(_LIBCPP_PSTL_CPU_BACKEND_SERIAL) || defined(_LIBCPP_PSTL_CPU_BACKEND_THREAD) ||                          \
-      defined(_LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH)
 template <>
 struct __select_backend<std::execution::parallel_policy> {
   using type = __cpu_backend_tag;
 };
 
-#    if defined(_LIBCPP_PSTL_GPU_OFFLOAD)
 template <>
 struct __select_backend<std::execution::parallel_unsequenced_policy> {
-  using type = __gpu_backend_tag;
+  using type = __cpu_backend_tag;
 };
-#    else
+
+#  elif defined(_LIBCPP_PSTL_BACKEND_OPENMP)
+
 template <>
-struct __select_backend<std::execution::parallel_unsequenced_policy> {
-  using type = __cpu_backend_tag;
+struct __select_backend<std::execution::sequenced_policy> {
+  using type = __omp_backend_tag;
+};
+
+#    if _LIBCPP_STD_VER >= 20
+template <>
+struct __select_backend<std::execution::unsequenced_policy> {
+  using type = __omp_backend_tag;
 };
 #    endif
 
+template <>
+struct __select_backend<std::execution::parallel_policy> {
+  using type = __omp_backend_tag;
+};
+
+template <>
+struct __select_backend<std::execution::parallel_unsequenced_policy> {
+  using type = __omp_backend_tag;
+};
+
 #  else
 
 // ...New vendors can add parallel backends here...
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backend.h b/libcxx/include/__algorithm/pstl_backends/cpu_backend.h
index 6980ded189ea2a..6de71ed702668b 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backend.h
@@ -55,14 +55,19 @@ into a program termination at the front-end level. When a backend returns a dise
 frontend will turn that into a call to `std::__throw_bad_alloc();` to report the internal failure to the user.
 */
 
-#include <__algorithm/pstl_backends/cpu_backends/any_of.h>
-#include <__algorithm/pstl_backends/cpu_backends/backend.h>
-#include <__algorithm/pstl_backends/cpu_backends/fill.h>
-#include <__algorithm/pstl_backends/cpu_backends/find_if.h>
-#include <__algorithm/pstl_backends/cpu_backends/for_each.h>
-#include <__algorithm/pstl_backends/cpu_backends/merge.h>
-#include <__algorithm/pstl_backends/cpu_backends/stable_sort.h>
-#include <__algorithm/pstl_backends/cpu_backends/transform.h>
-#include <__algorithm/pstl_backends/cpu_backends/transform_reduce.h>
+#if defined(_LIBCPP_PSTL_CPU_BACKEND_SERIAL) || defined(_LIBCPP_PSTL_CPU_BACKEND_THREAD) ||                            \
+    defined(_LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH)
+
+#  include <__algorithm/pstl_backends/cpu_backends/any_of.h>
+#  include <__algorithm/pstl_backends/cpu_backends/backend.h>
+#  include <__algorithm/pstl_backends/cpu_backends/fill.h>
+#  include <__algorithm/pstl_backends/cpu_backends/find_if.h>
+#  include <__algorithm/pstl_backends/cpu_backends/for_each.h>
+#  include <__algorithm/pstl_backends/cpu_backends/merge.h>
+#  include <__algorithm/pstl_backends/cpu_backends/stable_sort.h>
+#  include <__algorithm/pstl_backends/cpu_backends/transform.h>
+#  include <__algorithm/pstl_backends/cpu_backends/transform_reduce.h>
+
+#endif
 
 #endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKEND_H
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/backend.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/backend.h
index ea2210a4a7adbd..b8e9b1e28201d1 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/backend.h
@@ -18,6 +18,8 @@
 #  include <__algorithm/pstl_backends/cpu_backends/thread.h>
 #elif defined(_LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH)
 #  include <__algorithm/pstl_backends/cpu_backends/libdispatch.h>
+#elif defined(_LIBCPP_PSTL_BACKEND_OPENMP)
+// OpenMP covers both CPU and GPU backends
 #else
 #  error "Invalid CPU backend choice"
 #endif
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
deleted file mode 100644
index f41332fbf9f6d4..00000000000000
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
+++ /dev/null
@@ -1,27 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
-#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
-
-#include <__config>
-
-#include <__algorithm/pstl_backends/gpu_backends/backend.h>
-
-#if defined(_LIBCPP_PSTL_GPU_OFFLOAD)
-#  include <__algorithm/pstl_backends/gpu_backends/any_of.h>
-#  include <__algorithm/pstl_backends/gpu_backends/fill.h>
-#  include <__algorithm/pstl_backends/gpu_backends/find_if.h>
-#  include <__algorithm/pstl_backends/gpu_backends/for_each.h>
-#  include <__algorithm/pstl_backends/gpu_backends/merge.h>
-#  include <__algorithm/pstl_backends/gpu_backends/stable_sort.h>
-#  include <__algorithm/pstl_backends/gpu_backends/transform.h>
-#  include <__algorithm/pstl_backends/gpu_backends/transform_reduce.h>
-#endif
-
-#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp.h b/libcxx/include/__algorithm/pstl_backends/openmp.h
new file mode 100644
index 00000000000000..7787c82ff98825
--- /dev/null
+++ b/libcxx/include/__algorithm/pstl_backends/openmp.h
@@ -0,0 +1,27 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
+
+#include <__config>
+
+#include <__algorithm/pstl_backends/openmp/backend.h>
+
+#if defined(_LIBCPP_PSTL_BACKEND_OPENMP)
+#  include <__algorithm/pstl_backends/openmp/any_of.h>
+#  include <__algorithm/pstl_backends/openmp/fill.h>
+#  include <__algorithm/pstl_backends/openmp/find_if.h>
+#  include <__algorithm/pstl_backends/openmp/for_each.h>
+#  include <__algorithm/pstl_backends/openmp/merge.h>
+#  include <__algorithm/pstl_backends/openmp/stable_sort.h>
+#  include <__algorithm/pstl_backends/openmp/transform.h>
+#  include <__algorithm/pstl_backends/openmp/transform_reduce.h>
+#endif
+
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/any_of.h b/libcxx/include/__algorithm/pstl_backends/openmp/any_of.h
similarity index 70%
rename from libcxx/include/__algorithm/pstl_backends/gpu_backends/any_of.h
rename to libcxx/include/__algorithm/pstl_backends/openmp/any_of.h
index 8d911de55dcd68..2b9a88fc58edcd 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/any_of.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/any_of.h
@@ -6,13 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_ANY_OF_H
-#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_ANY_OF_H
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_ANY_OF_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_ANY_OF_H
 
 #include <__algorithm/any_of.h>
 #include <__algorithm/find_if.h>
-#include <__algorithm/pstl_backends/cpu_backends/backend.h>
-#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__atomic/atomic.h>
 #include <__atomic/memory_order.h>
 #include <__config>
@@ -29,13 +28,13 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <class _ExecutionPolicy, class _ForwardIterator, class _Predicate>
 _LIBCPP_HIDE_FROM_ABI bool
-__pstl_any_of(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) {
+__pstl_any_of(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) {
   // TODO: Implement GPU backend
-  return std::__pstl_any_of<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __pred);
+  return std::any_of(__first, __last, __pred);
 }
 
 _LIBCPP_END_NAMESPACE_STD
 
 #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
 
-#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_ANY_OF_H
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_ANY_OF_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/backend.h b/libcxx/include/__algorithm/pstl_backends/openmp/backend.h
similarity index 58%
rename from libcxx/include/__algorithm/pstl_backends/gpu_backends/backend.h
rename to libcxx/include/__algorithm/pstl_backends/openmp/backend.h
index a03ad35d8d2ae3..e4e6136082a342 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/backend.h
@@ -6,19 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_BACKEND_H
-#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_BACKEND_H
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_BACKEND_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_BACKEND_H
 
 #include <__config>
 #include <cstddef>
 
-#if defined(_LIBCPP_PSTL_GPU_OFFLOAD)
-#  if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
-#    include <__algorithm/pstl_backends/gpu_backends/omp_offload.h>
-#  else
-#    error Invalid PSTL GPU backend
-#  endif
-#endif
+#include <__algorithm/pstl_backends/openmp/omp_offload.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -28,10 +22,10 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
-struct __gpu_backend_tag {};
+struct __omp_backend_tag {};
 
 _LIBCPP_END_NAMESPACE_STD
 
 #endif // _LIBCPP_STD_VER >= 17
 
-#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_BACKEND_H
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_BACKEND_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
similarity index 78%
rename from libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
rename to libcxx/include/__algorithm/pstl_backends/openmp/fill.h
index f32ee8b016b3ea..56341b04f0e50d 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
@@ -6,12 +6,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FILL_H
-#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FILL_H
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_FILL_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_FILL_H
 
 #include <__algorithm/fill.h>
-#include <__algorithm/pstl_backends/cpu_backends/backend.h>
-#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
 #include <__iterator/concepts.h>
 #include <__iterator/iterator_traits.h>
@@ -29,7 +28,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <class _ExecutionPolicy, class _ForwardIterator, class _Tp>
 _LIBCPP_HIDE_FROM_ABI void
-__pstl_fill(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) {
+__pstl_fill(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) {
   // It is only safe to execute fill on the GPU, it the execution policy is
   // parallel unsequenced, as it is the only execution policy prohibiting throwing
   // exceptions and allowing SIMD instructions
@@ -40,7 +39,7 @@ __pstl_fill(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last
   }
   // Otherwise, we execute fill on the CPU instead
   else {
-    std::__pstl_fill<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __value);
+    std::fill(__first, __last, __value);
   }
 }
 
@@ -48,4 +47,4 @@ _LIBCPP_END_NAMESPACE_STD
 
 #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
 
-#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FILL_H
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_FILL_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/find_if.h b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
similarity index 71%
rename from libcxx/include/__algorithm/pstl_backends/gpu_backends/find_if.h
rename to libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
index 2d34938f92dff3..1e78cb67cd7597 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/find_if.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
@@ -6,12 +6,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_FIND_IF_H
-#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_FIND_IF_H
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_FIND_IF_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_FIND_IF_H
 
 #include <__algorithm/find_if.h>
-#include <__algorithm/pstl_backends/cpu_backends/backend.h>
-#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__atomic/atomic.h>
 #include <__config>
 #include <__functional/operations.h>
@@ -32,13 +31,13 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <class _ExecutionPolicy, class _ForwardIterator, class _Predicate>
 _LIBCPP_HIDE_FROM_ABI _ForwardIterator
-__pstl_find_if(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) {
+__pstl_find_if(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) {
   // TODO: Implement the GPU backend
-  return std::__pstl_find_if<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __pred);
+  return std::find_if(__first, __last, __pred);
 }
 
 _LIBCPP_END_NAMESPACE_STD
 
 #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
 
-#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_FIND_IF_H
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_FIND_IF_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
similarity index 74%
rename from libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
rename to libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
index f96b30b5ba25b2..401cdfade50a10 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
@@ -6,12 +6,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FOR_EACH_H
-#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FOR_EACH_H
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_FOR_EACH_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_FOR_EACH_H
 
 #include <__algorithm/for_each.h>
-#include <__algorithm/pstl_backends/cpu_backends/backend.h>
-#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
 #include <__iterator/concepts.h>
 #include <__type_traits/is_execution_policy.h>
@@ -28,7 +27,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <class _ExecutionPolicy, class _ForwardIterator, class _Functor>
 _LIBCPP_HIDE_FROM_ABI void
-__pstl_for_each(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Functor __func) {
+__pstl_for_each(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Functor __func) {
   // It is only safe to execute for_each on the GPU, it the execution policy is
   // parallel unsequenced, as it is the only execution policy prohibiting throwing
   // exceptions and allowing SIMD instructions
@@ -36,10 +35,10 @@ __pstl_for_each(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __
                 __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
     std::__par_backend::__parallel_for_simd_1(__first, __last - __first, __func);
-  }
-  // Else we fall back to the GPU backend
-  else {
-    std::__pstl_for_each<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __func);
+  } else {
+    // If it is not safe to offload to the GPU, we call the serial
+    // implementation
+    std::for_each(__first, __last, __func);
   }
 }
 
@@ -47,4 +46,4 @@ _LIBCPP_END_NAMESPACE_STD
 
 #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
 
-#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FOR_EACH_H
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_FOR_EACH_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/merge.h b/libcxx/include/__algorithm/pstl_backends/openmp/merge.h
similarity index 73%
rename from libcxx/include/__algorithm/pstl_backends/gpu_backends/merge.h
rename to libcxx/include/__algorithm/pstl_backends/openmp/merge.h
index bc947ebb27ac7f..23036e1cc85398 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/merge.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/merge.h
@@ -6,12 +6,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_MERGE_H
-#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_MERGE_H
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_MERGE_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_MERGE_H
 
 #include <__algorithm/merge.h>
-#include <__algorithm/pstl_backends/cpu_backends/backend.h>
-#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
 #include <__iterator/concepts.h>
 #include <__type_traits/is_execution_policy.h>
@@ -32,7 +31,7 @@ template <class _ExecutionPolicy,
           class _ForwardOutIterator,
           class _Comp>
 _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_merge(
-    __gpu_backend_tag,
+    __omp_backend_tag,
     _ForwardIterator1 __first1,
     _ForwardIterator1 __last1,
     _ForwardIterator2 __first2,
@@ -40,12 +39,11 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_merge(
     _ForwardOutIterator __result,
     _Comp __comp) {
   // TODO: Implement GPU backend
-  return std::__pstl_merge<_ExecutionPolicy>(
-      __cpu_backend_tag{}, __first1, __last1, __first2, __last2, __result, __comp);
+  return std::merge(__first1, __last1, __first2, __last2, __result, __comp);
 }
 
 _LIBCPP_END_NAMESPACE_STD
 
 #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
 
-#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_MERGE_H
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_MERGE_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h b/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
similarity index 98%
rename from libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
rename to libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
index 81ec1bbc63d008..ee0bfe13bac901 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
@@ -6,8 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_OMP_OFFLOAD_H
-#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_OMP_OFFLOAD_H
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_OMP_OFFLOAD_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_OMP_OFFLOAD_H
 
 #include <__assert>
 #include <__config>
@@ -346,4 +346,4 @@ _LIBCPP_END_NAMESPACE_STD
 
 _LIBCPP_POP_MACROS
 
-#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_OMP_OFFLOAD_H
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_OMP_OFFLOAD_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/stable_sort.h b/libcxx/include/__algorithm/pstl_backends/openmp/stable_sort.h
similarity index 68%
rename from libcxx/include/__algorithm/pstl_backends/gpu_backends/stable_sort.h
rename to libcxx/include/__algorithm/pstl_backends/openmp/stable_sort.h
index 5cd7081ef73e9c..7d28b8de77983a 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/stable_sort.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/stable_sort.h
@@ -6,11 +6,10 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_STABLE_SORT_H
-#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_STABLE_SORT_H
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_STABLE_SORT_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_STABLE_SORT_H
 
-#include <__algorithm/pstl_backends/cpu_backends/backend.h>
-#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__algorithm/stable_sort.h>
 #include <__config>
 #include <__type_traits/is_execution_policy.h>
@@ -26,13 +25,13 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <class _ExecutionPolicy, class _RandomAccessIterator, class _Comp>
 _LIBCPP_HIDE_FROM_ABI void
-__pstl_stable_sort(__gpu_backend_tag, _RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp) {
+__pstl_stable_sort(__omp_backend_tag, _RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp) {
   // TODO: Implement GPU backend.
-  __pstl_stable_sort<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __comp);
+  std::stable_sort(__first, __last, __comp);
 }
 
 _LIBCPP_END_NAMESPACE_STD
 
 #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
 
-#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_STABLE_SORT_H
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_STABLE_SORT_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
similarity index 76%
rename from libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
rename to libcxx/include/__algorithm/pstl_backends/openmp/transform.h
index 3af2106eb2bda2..6dc7ab22c98cc3 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
@@ -6,11 +6,10 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_TRANSFORM_H
-#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_TRANSFORM_H
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_TRANSFORM_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_TRANSFORM_H
 
-#include <__algorithm/pstl_backends/cpu_backends/backend.h>
-#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__algorithm/transform.h>
 #include <__config>
 #include <__iterator/concepts.h>
@@ -30,7 +29,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <class _ExecutionPolicy, class _ForwardIterator, class _ForwardOutIterator, class _UnaryOperation>
 _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
-    __gpu_backend_tag,
+    __omp_backend_tag,
     _ForwardIterator __first,
     _ForwardIterator __last,
     _ForwardOutIterator __result,
@@ -46,8 +45,8 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
     std::__par_backend::__parallel_for_simd_2(__first, __last - __first, __result, __op);
     return __result + (__last - __first);
   }
-  // If it is not safe to offload to the GPU, we rely on the CPU backend.
-  return std::__pstl_transform<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __result, __op);
+  // If it is not safe to offload to the GPU, we rely on the serial backend.
+  return std::transform(__first, __last, __result, __op);
 }
 
 template <class _ExecutionPolicy,
@@ -57,7 +56,7 @@ template <class _ExecutionPolicy,
           class _BinaryOperation,
           enable_if_t<is_execution_policy_v<__remove_cvref_t<_ExecutionPolicy>>, int> = 0>
 _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
-    __gpu_backend_tag,
+    __omp_backend_tag,
     _ForwardIterator1 __first1,
     _ForwardIterator1 __last1,
     _ForwardIterator2 __first2,
@@ -68,20 +67,16 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
                 __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value &&
                 __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value &&
-#  if _LIBCPP_STD_VER <= 17
-                __libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
-                __libcpp_is_contiguous_iterator<_ForwardIterator2>::value &&
-#  endif
                 __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
     std::__par_backend::__parallel_for_simd_3(__first1, __last1 - __first1, __first2, __result, __op);
     return __result + (__last1 - __first1);
   }
-  // If it is not safe to offload to the GPU, we rely on the CPU backend.
-  return std::__pstl_transform<_ExecutionPolicy>(__cpu_backend_tag{}, __first1, __last1, __first2, __result, __op);
+  // If it is not safe to offload to the GPU, we call the serial implementation
+  return std::transform(__first1, __last1, __first2, __result, __op);
 }
 
 _LIBCPP_END_NAMESPACE_STD
 
 #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
 
-#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_TRANSFORM_H
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_TRANSFORM_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
similarity index 85%
rename from libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h
rename to libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
index eeffe62c040f08..c63e456b314c1c 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
@@ -6,11 +6,10 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_TRANSFORM_REDUCE_H
-#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_TRANSFORM_REDUCE_H
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_TRANSFORM_REDUCE_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_TRANSFORM_REDUCE_H
 
-#include <__algorithm/pstl_backends/cpu_backends/backend.h>
-#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
 #include <__functional/operations.h>
 #include <__iterator/concepts.h>
@@ -62,7 +61,7 @@ template <class _ExecutionPolicy,
           class _BinaryOperation1,
           class _BinaryOperation2>
 _LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
-    __gpu_backend_tag,
+    __omp_backend_tag,
     _ForwardIterator1 __first1,
     _ForwardIterator1 __last1,
     _ForwardIterator2 __first2,
@@ -82,8 +81,7 @@ _LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
     return std::__par_backend::__parallel_for_simd_reduction_2(
         __first1, __first2, __last1 - __first1, __init, __reduce, __transform);
   }
-  return std::__pstl_transform_reduce<_ExecutionPolicy>(
-      __cpu_backend_tag{}, __first1, __last1, __first2, std::move(__init), __reduce, __transform);
+  return std::transform_reduce(__first1, __last1, __first2, std::move(__init), __reduce, __transform);
 }
 
 //===----------------------------------------------------------------------===//
@@ -92,7 +90,7 @@ _LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
 
 template <class _ExecutionPolicy, class _ForwardIterator, class _Tp, class _BinaryOperation, class _UnaryOperation>
 _LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
-    __gpu_backend_tag,
+    __omp_backend_tag,
     _ForwardIterator __first,
     _ForwardIterator __last,
     _Tp __init,
@@ -101,20 +99,17 @@ _LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
   // The interface for the function switched between C++17 and C++20
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator>::value && is_arithmetic_v<_Tp> &&
-#  if _LIBCPP_STD_VER <= 17
-                __libcpp_is_contiguous_iterator<_ForwardIterator>::value &&
-#  endif
                 (__is_trivial_plus_operation<_BinaryOperation, _Tp, _Tp>::value ||
                  __is_supported_reduction<_BinaryOperation, _Tp, _Tp>::value)) {
     return std::__par_backend::__parallel_for_simd_reduction_1(
         __first, __last - __first, __init, __reduce, __transform);
   }
-  return std::__pstl_transform_reduce<_ExecutionPolicy>(
-      __cpu_backend_tag{}, __first, __last, std::move(__init), __reduce, __transform);
+  // If it is not safe to offload to the GPU, we call the serial implementation
+  return std::transform_reduce(__first, __last, std::move(__init), __reduce, __transform);
 }
 
 _LIBCPP_END_NAMESPACE_STD
 
 #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
 
-#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_TRANSFORM_REDUCE_H
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_TRANSFORM_REDUCE_H
diff --git a/libcxx/include/__config_site.in b/libcxx/include/__config_site.in
index e7fb4f42307933..21334e797b8963 100644
--- a/libcxx/include/__config_site.in
+++ b/libcxx/include/__config_site.in
@@ -34,8 +34,7 @@
 #cmakedefine _LIBCPP_PSTL_CPU_BACKEND_SERIAL
 #cmakedefine _LIBCPP_PSTL_CPU_BACKEND_THREAD
 #cmakedefine _LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH
-#cmakedefine _LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD
-#cmakedefine _LIBCPP_PSTL_GPU_OFFLOAD
+#cmakedefine _LIBCPP_PSTL_BACKEND_OPENMP
 
 // Hardening.
 #cmakedefine01 _LIBCPP_ENABLE_HARDENED_MODE_DEFAULT

>From 3c1ca8b4545918b240eb45b2f96ccc4c8acfc2b8 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Fri, 6 Oct 2023 12:39:29 -0700
Subject: [PATCH 12/65] Configured PSTL OpenMP backend to use the serial CPU
 backend when it is not safe to offload

---
 libcxx/include/__algorithm/pstl_backend.h      | 18 +++---------------
 .../__algorithm/pstl_backends/cpu_backend.h    |  5 -----
 .../pstl_backends/cpu_backends/backend.h       |  4 +---
 .../include/__algorithm/pstl_backends/openmp.h |  6 ++----
 .../__algorithm/pstl_backends/openmp/any_of.h  |  3 ++-
 .../__algorithm/pstl_backends/openmp/fill.h    |  3 ++-
 .../__algorithm/pstl_backends/openmp/find_if.h |  3 ++-
 .../pstl_backends/openmp/for_each.h            |  9 +++++----
 .../__algorithm/pstl_backends/openmp/merge.h   |  4 +++-
 .../pstl_backends/openmp/stable_sort.h         |  3 ++-
 .../pstl_backends/openmp/transform.h           | 12 +++++-------
 .../pstl_backends/openmp/transform_reduce.h    | 12 +++++-------
 12 files changed, 32 insertions(+), 50 deletions(-)

diff --git a/libcxx/include/__algorithm/pstl_backend.h b/libcxx/include/__algorithm/pstl_backend.h
index b3f202765cd41b..dca9a29ce25d54 100644
--- a/libcxx/include/__algorithm/pstl_backend.h
+++ b/libcxx/include/__algorithm/pstl_backend.h
@@ -192,9 +192,6 @@ frontend will turn that into a call to `std::__throw_bad_alloc();` to report the
 template <class _ExecutionPolicy>
 struct __select_backend;
 
-#  if defined(_LIBCPP_PSTL_CPU_BACKEND_SERIAL) || defined(_LIBCPP_PSTL_CPU_BACKEND_THREAD) ||                          \
-      defined(_LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH)
-
 template <>
 struct __select_backend<std::execution::sequenced_policy> {
   using type = __cpu_backend_tag;
@@ -207,6 +204,9 @@ struct __select_backend<std::execution::unsequenced_policy> {
 };
 #  endif
 
+#  if defined(_LIBCPP_PSTL_CPU_BACKEND_SERIAL) || defined(_LIBCPP_PSTL_CPU_BACKEND_THREAD) ||                          \
+      defined(_LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH)
+
 template <>
 struct __select_backend<std::execution::parallel_policy> {
   using type = __cpu_backend_tag;
@@ -219,18 +219,6 @@ struct __select_backend<std::execution::parallel_unsequenced_policy> {
 
 #  elif defined(_LIBCPP_PSTL_BACKEND_OPENMP)
 
-template <>
-struct __select_backend<std::execution::sequenced_policy> {
-  using type = __omp_backend_tag;
-};
-
-#    if _LIBCPP_STD_VER >= 20
-template <>
-struct __select_backend<std::execution::unsequenced_policy> {
-  using type = __omp_backend_tag;
-};
-#    endif
-
 template <>
 struct __select_backend<std::execution::parallel_policy> {
   using type = __omp_backend_tag;
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backend.h b/libcxx/include/__algorithm/pstl_backends/cpu_backend.h
index 6de71ed702668b..5e9781d9bc59c1 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backend.h
@@ -55,9 +55,6 @@ into a program termination at the front-end level. When a backend returns a dise
 frontend will turn that into a call to `std::__throw_bad_alloc();` to report the internal failure to the user.
 */
 
-#if defined(_LIBCPP_PSTL_CPU_BACKEND_SERIAL) || defined(_LIBCPP_PSTL_CPU_BACKEND_THREAD) ||                            \
-    defined(_LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH)
-
 #  include <__algorithm/pstl_backends/cpu_backends/any_of.h>
 #  include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #  include <__algorithm/pstl_backends/cpu_backends/fill.h>
@@ -68,6 +65,4 @@ frontend will turn that into a call to `std::__throw_bad_alloc();` to report the
 #  include <__algorithm/pstl_backends/cpu_backends/transform.h>
 #  include <__algorithm/pstl_backends/cpu_backends/transform_reduce.h>
 
-#endif
-
 #endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKEND_H
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/backend.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/backend.h
index b8e9b1e28201d1..51aa878d734514 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/backend.h
@@ -12,14 +12,12 @@
 #include <__config>
 #include <cstddef>
 
-#if defined(_LIBCPP_PSTL_CPU_BACKEND_SERIAL)
+#if defined(_LIBCPP_PSTL_CPU_BACKEND_SERIAL) || defined(_LIBCPP_PSTL_BACKEND_OPENMP)
 #  include <__algorithm/pstl_backends/cpu_backends/serial.h>
 #elif defined(_LIBCPP_PSTL_CPU_BACKEND_THREAD)
 #  include <__algorithm/pstl_backends/cpu_backends/thread.h>
 #elif defined(_LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH)
 #  include <__algorithm/pstl_backends/cpu_backends/libdispatch.h>
-#elif defined(_LIBCPP_PSTL_BACKEND_OPENMP)
-// OpenMP covers both CPU and GPU backends
 #else
 #  error "Invalid CPU backend choice"
 #endif
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp.h b/libcxx/include/__algorithm/pstl_backends/openmp.h
index 7787c82ff98825..4c121e187ae246 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp.h
@@ -9,11 +9,10 @@
 #ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
 #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
 
-#include <__config>
+#  include <__config>
 
-#include <__algorithm/pstl_backends/openmp/backend.h>
+#  include <__algorithm/pstl_backends/openmp/backend.h>
 
-#if defined(_LIBCPP_PSTL_BACKEND_OPENMP)
 #  include <__algorithm/pstl_backends/openmp/any_of.h>
 #  include <__algorithm/pstl_backends/openmp/fill.h>
 #  include <__algorithm/pstl_backends/openmp/find_if.h>
@@ -22,6 +21,5 @@
 #  include <__algorithm/pstl_backends/openmp/stable_sort.h>
 #  include <__algorithm/pstl_backends/openmp/transform.h>
 #  include <__algorithm/pstl_backends/openmp/transform_reduce.h>
-#endif
 
 #endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/any_of.h b/libcxx/include/__algorithm/pstl_backends/openmp/any_of.h
index 2b9a88fc58edcd..a36f545d5bd719 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/any_of.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/any_of.h
@@ -11,6 +11,7 @@
 
 #include <__algorithm/any_of.h>
 #include <__algorithm/find_if.h>
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__atomic/atomic.h>
 #include <__atomic/memory_order.h>
@@ -30,7 +31,7 @@ template <class _ExecutionPolicy, class _ForwardIterator, class _Predicate>
 _LIBCPP_HIDE_FROM_ABI bool
 __pstl_any_of(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) {
   // TODO: Implement GPU backend
-  return std::any_of(__first, __last, __pred);
+  return std::__pstl_any_of<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __pred);
 }
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/fill.h b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
index 56341b04f0e50d..6098769a4fedb2 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
@@ -10,6 +10,7 @@
 #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_FILL_H
 
 #include <__algorithm/fill.h>
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
 #include <__iterator/concepts.h>
@@ -39,7 +40,7 @@ __pstl_fill(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __last
   }
   // Otherwise, we execute fill on the CPU instead
   else {
-    std::fill(__first, __last, __value);
+    std::__pstl_fill<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __value);
   }
 }
 
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
index 1e78cb67cd7597..6cd3b808e0700c 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
@@ -10,6 +10,7 @@
 #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_FIND_IF_H
 
 #include <__algorithm/find_if.h>
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__atomic/atomic.h>
 #include <__config>
@@ -33,7 +34,7 @@ template <class _ExecutionPolicy, class _ForwardIterator, class _Predicate>
 _LIBCPP_HIDE_FROM_ABI _ForwardIterator
 __pstl_find_if(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) {
   // TODO: Implement the GPU backend
-  return std::find_if(__first, __last, __pred);
+  return std::__pstl_find_if<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __pred);
 }
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
index 401cdfade50a10..c1a159829989bb 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
@@ -10,6 +10,7 @@
 #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_FOR_EACH_H
 
 #include <__algorithm/for_each.h>
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
 #include <__iterator/concepts.h>
@@ -35,10 +36,10 @@ __pstl_for_each(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __
                 __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
     std::__par_backend::__parallel_for_simd_1(__first, __last - __first, __func);
-  } else {
-    // If it is not safe to offload to the GPU, we call the serial
-    // implementation
-    std::for_each(__first, __last, __func);
+  }
+  // Otherwise, we fall back to the CPU backend
+  else {
+    std::__pstl_for_each<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __func);
   }
 }
 
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/merge.h b/libcxx/include/__algorithm/pstl_backends/openmp/merge.h
index 23036e1cc85398..b6587b161d1548 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/merge.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/merge.h
@@ -10,6 +10,7 @@
 #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_MERGE_H
 
 #include <__algorithm/merge.h>
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
 #include <__iterator/concepts.h>
@@ -39,7 +40,8 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_merge(
     _ForwardOutIterator __result,
     _Comp __comp) {
   // TODO: Implement GPU backend
-  return std::merge(__first1, __last1, __first2, __last2, __result, __comp);
+  return std::__pstl_merge<_ExecutionPolicy>(
+      __cpu_backend_tag{}, __first1, __last1, __first2, __last2, __result, __comp);
 }
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/stable_sort.h b/libcxx/include/__algorithm/pstl_backends/openmp/stable_sort.h
index 7d28b8de77983a..ac9323b04f63f9 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/stable_sort.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/stable_sort.h
@@ -9,6 +9,7 @@
 #ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_STABLE_SORT_H
 #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_STABLE_SORT_H
 
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__algorithm/stable_sort.h>
 #include <__config>
@@ -27,7 +28,7 @@ template <class _ExecutionPolicy, class _RandomAccessIterator, class _Comp>
 _LIBCPP_HIDE_FROM_ABI void
 __pstl_stable_sort(__omp_backend_tag, _RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp) {
   // TODO: Implement GPU backend.
-  std::stable_sort(__first, __last, __comp);
+  std::__pstl_stable_sort<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __comp);
 }
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
index 6dc7ab22c98cc3..f855c5146e0f39 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
@@ -9,6 +9,7 @@
 #ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_TRANSFORM_H
 #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_TRANSFORM_H
 
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__algorithm/transform.h>
 #include <__config>
@@ -38,15 +39,12 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
                 __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value &&
-#  if _LIBCPP_STD_VER <= 17
-                __libcpp_is_contiguous_iterator<_ForwardIterator>::value &&
-#  endif
                 __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
     std::__par_backend::__parallel_for_simd_2(__first, __last - __first, __result, __op);
     return __result + (__last - __first);
   }
-  // If it is not safe to offload to the GPU, we rely on the serial backend.
-  return std::transform(__first, __last, __result, __op);
+  // If it is not safe to offload to the GPU, we rely on the CPU backend.
+  return std::__pstl_transform<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __result, __op);
 }
 
 template <class _ExecutionPolicy,
@@ -71,8 +69,8 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
     std::__par_backend::__parallel_for_simd_3(__first1, __last1 - __first1, __first2, __result, __op);
     return __result + (__last1 - __first1);
   }
-  // If it is not safe to offload to the GPU, we call the serial implementation
-  return std::transform(__first1, __last1, __first2, __result, __op);
+  // If it is not safe to offload to the GPU, we rely on the CPU backend.
+  return std::__pstl_transform<_ExecutionPolicy>(__cpu_backend_tag{}, __first1, __last1, __first2, __result, __op);
 }
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
index c63e456b314c1c..b34d2be79921a7 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
@@ -9,6 +9,7 @@
 #ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_TRANSFORM_REDUCE_H
 #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_TRANSFORM_REDUCE_H
 
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
 #include <__functional/operations.h>
@@ -72,16 +73,13 @@ _LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value && is_arithmetic_v<_Tp> &&
-#  if _LIBCPP_STD_VER <= 17
-                __libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
-                __libcpp_is_contiguous_iterator<_ForwardIterator2>::value &&
-#  endif
                 (__is_trivial_plus_operation<_BinaryOperation1, _Tp, _Tp>::value ||
                  __is_supported_reduction<_BinaryOperation1, _Tp, _Tp>::value)) {
     return std::__par_backend::__parallel_for_simd_reduction_2(
         __first1, __first2, __last1 - __first1, __init, __reduce, __transform);
   }
-  return std::transform_reduce(__first1, __last1, __first2, std::move(__init), __reduce, __transform);
+  return std::__pstl_transform_reduce<_ExecutionPolicy>(
+      __cpu_backend_tag{}, __first1, __last1, __first2, std::move(__init), __reduce, __transform);
 }
 
 //===----------------------------------------------------------------------===//
@@ -104,8 +102,8 @@ _LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
     return std::__par_backend::__parallel_for_simd_reduction_1(
         __first, __last - __first, __init, __reduce, __transform);
   }
-  // If it is not safe to offload to the GPU, we call the serial implementation
-  return std::transform_reduce(__first, __last, std::move(__init), __reduce, __transform);
+  return std::__pstl_transform_reduce<_ExecutionPolicy>(
+      __cpu_backend_tag{}, __first, __last, std::move(__init), __reduce, __transform);
 }
 
 _LIBCPP_END_NAMESPACE_STD

>From cd4d5d1d4b92c3df04adc79be070d3dae9fcf9f1 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Mon, 9 Oct 2023 11:48:58 -0700
Subject: [PATCH 13/65] Clang-formatted cpu_backend.h and openmp.h

---
 .../__algorithm/pstl_backends/cpu_backend.h   | 18 ++++++++---------
 .../__algorithm/pstl_backends/openmp.h        | 20 +++++++++----------
 2 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backend.h b/libcxx/include/__algorithm/pstl_backends/cpu_backend.h
index 5e9781d9bc59c1..6980ded189ea2a 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backend.h
@@ -55,14 +55,14 @@ into a program termination at the front-end level. When a backend returns a dise
 frontend will turn that into a call to `std::__throw_bad_alloc();` to report the internal failure to the user.
 */
 
-#  include <__algorithm/pstl_backends/cpu_backends/any_of.h>
-#  include <__algorithm/pstl_backends/cpu_backends/backend.h>
-#  include <__algorithm/pstl_backends/cpu_backends/fill.h>
-#  include <__algorithm/pstl_backends/cpu_backends/find_if.h>
-#  include <__algorithm/pstl_backends/cpu_backends/for_each.h>
-#  include <__algorithm/pstl_backends/cpu_backends/merge.h>
-#  include <__algorithm/pstl_backends/cpu_backends/stable_sort.h>
-#  include <__algorithm/pstl_backends/cpu_backends/transform.h>
-#  include <__algorithm/pstl_backends/cpu_backends/transform_reduce.h>
+#include <__algorithm/pstl_backends/cpu_backends/any_of.h>
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
+#include <__algorithm/pstl_backends/cpu_backends/fill.h>
+#include <__algorithm/pstl_backends/cpu_backends/find_if.h>
+#include <__algorithm/pstl_backends/cpu_backends/for_each.h>
+#include <__algorithm/pstl_backends/cpu_backends/merge.h>
+#include <__algorithm/pstl_backends/cpu_backends/stable_sort.h>
+#include <__algorithm/pstl_backends/cpu_backends/transform.h>
+#include <__algorithm/pstl_backends/cpu_backends/transform_reduce.h>
 
 #endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKEND_H
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp.h b/libcxx/include/__algorithm/pstl_backends/openmp.h
index 4c121e187ae246..6afc43f1faf356 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp.h
@@ -9,17 +9,17 @@
 #ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
 #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
 
-#  include <__config>
+#include <__config>
 
-#  include <__algorithm/pstl_backends/openmp/backend.h>
+#include <__algorithm/pstl_backends/openmp/backend.h>
 
-#  include <__algorithm/pstl_backends/openmp/any_of.h>
-#  include <__algorithm/pstl_backends/openmp/fill.h>
-#  include <__algorithm/pstl_backends/openmp/find_if.h>
-#  include <__algorithm/pstl_backends/openmp/for_each.h>
-#  include <__algorithm/pstl_backends/openmp/merge.h>
-#  include <__algorithm/pstl_backends/openmp/stable_sort.h>
-#  include <__algorithm/pstl_backends/openmp/transform.h>
-#  include <__algorithm/pstl_backends/openmp/transform_reduce.h>
+#include <__algorithm/pstl_backends/openmp/any_of.h>
+#include <__algorithm/pstl_backends/openmp/fill.h>
+#include <__algorithm/pstl_backends/openmp/find_if.h>
+#include <__algorithm/pstl_backends/openmp/for_each.h>
+#include <__algorithm/pstl_backends/openmp/merge.h>
+#include <__algorithm/pstl_backends/openmp/stable_sort.h>
+#include <__algorithm/pstl_backends/openmp/transform.h>
+#include <__algorithm/pstl_backends/openmp/transform_reduce.h>
 
 #endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H

>From 5bf3b59a994fbd35167c2ad55c64ba14d8269b33 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Mon, 9 Oct 2023 17:08:00 -0700
Subject: [PATCH 14/65] Removing unnecessary includes and redundant iterator
 checks

---
 libcxx/include/__algorithm/pstl_backend.h       |  2 +-
 .../include/__algorithm/pstl_backends/openmp.h  |  6 +++---
 .../__algorithm/pstl_backends/openmp/any_of.h   |  8 --------
 .../__algorithm/pstl_backends/openmp/fill.h     |  9 ++-------
 .../__algorithm/pstl_backends/openmp/find_if.h  |  8 --------
 .../__algorithm/pstl_backends/openmp/for_each.h |  4 ----
 .../__algorithm/pstl_backends/openmp/merge.h    |  4 ----
 .../pstl_backends/openmp/stable_sort.h          |  2 --
 .../pstl_backends/openmp/transform.h            |  6 ------
 .../pstl_backends/openmp/transform_reduce.h     | 17 +++++------------
 10 files changed, 11 insertions(+), 55 deletions(-)

diff --git a/libcxx/include/__algorithm/pstl_backend.h b/libcxx/include/__algorithm/pstl_backend.h
index dca9a29ce25d54..e2705f2f9c8154 100644
--- a/libcxx/include/__algorithm/pstl_backend.h
+++ b/libcxx/include/__algorithm/pstl_backend.h
@@ -221,7 +221,7 @@ struct __select_backend<std::execution::parallel_unsequenced_policy> {
 
 template <>
 struct __select_backend<std::execution::parallel_policy> {
-  using type = __omp_backend_tag;
+  using type = __cpu_backend_tag;
 };
 
 template <>
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp.h b/libcxx/include/__algorithm/pstl_backends/openmp.h
index 6afc43f1faf356..9e5490db655f4b 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp.h
@@ -6,8 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
-#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_H
 
 #include <__config>
 
@@ -22,4 +22,4 @@
 #include <__algorithm/pstl_backends/openmp/transform.h>
 #include <__algorithm/pstl_backends/openmp/transform_reduce.h>
 
-#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_H
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/any_of.h b/libcxx/include/__algorithm/pstl_backends/openmp/any_of.h
index a36f545d5bd719..beae485f41f248 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/any_of.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/any_of.h
@@ -10,18 +10,10 @@
 #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_ANY_OF_H
 
 #include <__algorithm/any_of.h>
-#include <__algorithm/find_if.h>
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
-#include <__atomic/atomic.h>
-#include <__atomic/memory_order.h>
 #include <__config>
-#include <__functional/operations.h>
-#include <__iterator/concepts.h>
 #include <__type_traits/is_execution_policy.h>
-#include <__utility/pair.h>
-#include <__utility/terminate_on_exception.h>
-#include <cstdint>
 
 #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
 
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/fill.h b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
index 6098769a4fedb2..19a3ba3f7be07b 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
@@ -13,11 +13,7 @@
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
-#include <__iterator/concepts.h>
-#include <__iterator/iterator_traits.h>
 #include <__type_traits/is_execution_policy.h>
-#include <__utility/terminate_on_exception.h>
-#include <stdio.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -31,10 +27,9 @@ template <class _ExecutionPolicy, class _ForwardIterator, class _Tp>
 _LIBCPP_HIDE_FROM_ABI void
 __pstl_fill(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) {
   // It is only safe to execute fill on the GPU, it the execution policy is
-  // parallel unsequenced, as it is the only execution policy prohibiting throwing
-  // exceptions and allowing SIMD instructions
+  // parallel unsequenced, as it is the only execution policy allowing
+  // SIMD instructions
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
-                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
     std::__par_backend::__parallel_for_simd_val_1(__first, __last - __first, __value);
   }
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
index 6cd3b808e0700c..4526d3976567a0 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
@@ -12,15 +12,7 @@
 #include <__algorithm/find_if.h>
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
-#include <__atomic/atomic.h>
 #include <__config>
-#include <__functional/operations.h>
-#include <__iterator/concepts.h>
-#include <__iterator/iterator_traits.h>
-#include <__type_traits/is_execution_policy.h>
-#include <__utility/pair.h>
-#include <__utility/terminate_on_exception.h>
-#include <cstddef>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
index c1a159829989bb..30c2e5cca2f264 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
@@ -13,10 +13,7 @@
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
-#include <__iterator/concepts.h>
 #include <__type_traits/is_execution_policy.h>
-#include <__utility/terminate_on_exception.h>
-#include <stdio.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -33,7 +30,6 @@ __pstl_for_each(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __
   // parallel unsequenced, as it is the only execution policy prohibiting throwing
   // exceptions and allowing SIMD instructions
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
-                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
     std::__par_backend::__parallel_for_simd_1(__first, __last - __first, __func);
   }
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/merge.h b/libcxx/include/__algorithm/pstl_backends/openmp/merge.h
index b6587b161d1548..666c3a0e000193 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/merge.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/merge.h
@@ -13,10 +13,6 @@
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
-#include <__iterator/concepts.h>
-#include <__type_traits/is_execution_policy.h>
-#include <__utility/move.h>
-#include <__utility/terminate_on_exception.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/stable_sort.h b/libcxx/include/__algorithm/pstl_backends/openmp/stable_sort.h
index ac9323b04f63f9..46e48dd279e89b 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/stable_sort.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/stable_sort.h
@@ -13,8 +13,6 @@
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__algorithm/stable_sort.h>
 #include <__config>
-#include <__type_traits/is_execution_policy.h>
-#include <__utility/terminate_on_exception.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
index f855c5146e0f39..e507a16eac965b 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
@@ -14,11 +14,7 @@
 #include <__algorithm/transform.h>
 #include <__config>
 #include <__iterator/concepts.h>
-#include <__iterator/iterator_traits.h>
-#include <__type_traits/enable_if.h>
 #include <__type_traits/is_execution_policy.h>
-#include <__type_traits/remove_cvref.h>
-#include <__utility/terminate_on_exception.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -38,7 +34,6 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
   // The interface for the function switched between C++17 and C++20
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
-                __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
     std::__par_backend::__parallel_for_simd_2(__first, __last - __first, __result, __op);
     return __result + (__last - __first);
@@ -64,7 +59,6 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value &&
-                __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
     std::__par_backend::__parallel_for_simd_3(__first1, __last1 - __first1, __first2, __result, __op);
     return __result + (__last1 - __first1);
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
index b34d2be79921a7..eb0a7ba52d8fdc 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
@@ -12,17 +12,12 @@
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
-#include <__functional/operations.h>
 #include <__iterator/concepts.h>
-#include <__iterator/iterator_traits.h>
 #include <__numeric/transform_reduce.h>
 #include <__type_traits/integral_constant.h>
 #include <__type_traits/is_arithmetic.h>
 #include <__type_traits/is_execution_policy.h>
 #include <__type_traits/operation_traits.h>
-#include <__utility/move.h>
-#include <__utility/terminate_on_exception.h>
-#include <new>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -33,13 +28,13 @@
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <class _T1, class _T2, class _T3>
-struct _LIBCPP_HIDE_FROM_ABI __is_supported_reduction : std::false_type {};
+struct __is_supported_reduction : std::false_type {};
 
 #  define __PSTL_IS_SUPPORTED_REDUCTION(funname)                                                                       \
     template <class _Tp>                                                                                               \
-    struct _LIBCPP_HIDE_FROM_ABI __is_supported_reduction<std::funname<_Tp>, _Tp, _Tp> : std::true_type {};            \
+    struct __is_supported_reduction<std::funname<_Tp>, _Tp, _Tp> : std::true_type {};                                  \
     template <class _Tp, class _Up>                                                                                    \
-    struct _LIBCPP_HIDE_FROM_ABI __is_supported_reduction<std::funname<>, _Tp, _Up> : std::true_type {};
+    struct __is_supported_reduction<std::funname<>, _Tp, _Up> : std::true_type {};
 
 // __is_trivial_plus_operation already exists
 __PSTL_IS_SUPPORTED_REDUCTION(plus)
@@ -73,8 +68,7 @@ _LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value && is_arithmetic_v<_Tp> &&
-                (__is_trivial_plus_operation<_BinaryOperation1, _Tp, _Tp>::value ||
-                 __is_supported_reduction<_BinaryOperation1, _Tp, _Tp>::value)) {
+                __is_supported_reduction<_BinaryOperation1, _Tp, _Tp>::value) {
     return std::__par_backend::__parallel_for_simd_reduction_2(
         __first1, __first2, __last1 - __first1, __init, __reduce, __transform);
   }
@@ -97,8 +91,7 @@ _LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
   // The interface for the function switched between C++17 and C++20
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator>::value && is_arithmetic_v<_Tp> &&
-                (__is_trivial_plus_operation<_BinaryOperation, _Tp, _Tp>::value ||
-                 __is_supported_reduction<_BinaryOperation, _Tp, _Tp>::value)) {
+                __is_supported_reduction<_BinaryOperation, _Tp, _Tp>::value) {
     return std::__par_backend::__parallel_for_simd_reduction_1(
         __first, __last - __first, __init, __reduce, __transform);
   }

>From 912626fb2eca3c5318b8f01a032f13692e2c5111 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Mon, 16 Oct 2023 01:06:48 -0700
Subject: [PATCH 15/65] Adding optional to OpenMP backend and other
 refactorings requested during code review

---
 .../__algorithm/pstl_backends/openmp/any_of.h |   3 +-
 .../__algorithm/pstl_backends/openmp/fill.h   |  32 ++-
 .../pstl_backends/openmp/find_if.h            |  39 +++-
 .../pstl_backends/openmp/for_each.h           |  33 +++-
 .../__algorithm/pstl_backends/openmp/merge.h  |   3 +-
 .../pstl_backends/openmp/omp_offload.h        | 185 +++---------------
 .../pstl_backends/openmp/stable_sort.h        |   6 +-
 .../pstl_backends/openmp/transform.h          |  93 ++++++++-
 .../pstl_backends/openmp/transform_reduce.h   |  16 +-
 9 files changed, 225 insertions(+), 185 deletions(-)

diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/any_of.h b/libcxx/include/__algorithm/pstl_backends/openmp/any_of.h
index beae485f41f248..65f2294ff2ee5f 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/any_of.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/any_of.h
@@ -14,13 +14,14 @@
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
 #include <__type_traits/is_execution_policy.h>
+#include <optional>
 
 #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <class _ExecutionPolicy, class _ForwardIterator, class _Predicate>
-_LIBCPP_HIDE_FROM_ABI bool
+_LIBCPP_HIDE_FROM_ABI optional<bool>
 __pstl_any_of(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) {
   // TODO: Implement GPU backend
   return std::__pstl_any_of<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __pred);
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/fill.h b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
index 19a3ba3f7be07b..250e514b4d526f 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
@@ -14,6 +14,8 @@
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
 #include <__type_traits/is_execution_policy.h>
+#include <__utility/empty.h>
+#include <optional>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -23,19 +25,43 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
+namespace __par_backend {
+inline namespace __omp_gpu_backend {
+// Assigns __value to each of the __n elements starting at __first inside an OpenMP target region.
+template <class _Tp, class _DifferenceType, class _Up>
+_LIBCPP_HIDE_FROM_ABI optional<__empty> __omp_parallel_for_simd_val(
+    _Tp* __first, _DifferenceType __n, const _Up& __value) noexcept {
+  __omp_gpu_backend::__omp_map_alloc(__first, __n); // NOTE(review): presumably maps device storage only -- confirm
+#  pragma omp target teams distribute parallel for simd firstprivate(__value)
+  for (_DifferenceType __i = 0; __i < __n; ++__i)
+    *(__first+__i) = __value;
+  __omp_gpu_backend::__omp_map_from(__first, __n); // NOTE(review): presumably copies the result to the host -- confirm
+  return __empty{};
+}
+// Iterator overload: extracts the underlying pointer and forwards to __omp_parallel_for_simd_val.
+template <class _ForwardIterator, class _DifferenceType, class _Tp>
+_LIBCPP_HIDE_FROM_ABI optional<__empty>
+__parallel_for_simd_val(_ForwardIterator __first, _DifferenceType __n, const _Tp& __value) noexcept {
+  return __omp_gpu_backend::__omp_parallel_for_simd_val(__omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __value);
+}
+
+} // namespace __omp_gpu_backend
+} // namespace __par_backend
+
 template <class _ExecutionPolicy, class _ForwardIterator, class _Tp>
-_LIBCPP_HIDE_FROM_ABI void
+_LIBCPP_HIDE_FROM_ABI optional<__empty>
 __pstl_fill(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) {
   // It is only safe to execute fill on the GPU, it the execution policy is
   // parallel unsequenced, as it is the only execution policy allowing
   // SIMD instructions
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
+                __is_parallel_execution_policy_v<_ExecutionPolicy> &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
-    std::__par_backend::__parallel_for_simd_val_1(__first, __last - __first, __value);
+    return std::__par_backend::__parallel_for_simd_val(__first, __last - __first, __value);
   }
   // Otherwise, we execute fill on the CPU instead
   else {
-    std::__pstl_fill<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __value);
+    return std::__pstl_fill<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __value);
   }
 }
 
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
index 4526d3976567a0..94f7c42953c2a4 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
@@ -13,6 +13,7 @@
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
+#include <optional>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -22,11 +23,43 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
+namespace __par_backend {
+inline namespace __omp_gpu_backend {
+
+template <class _Tp, class _DifferenceType, class _Predicate>
+_LIBCPP_HIDE_FROM_ABI _DifferenceType  __omp_parallel_for_min_idx(_Tp* __first, _DifferenceType __n, _Predicate __pred) noexcept {
+  __omp_gpu_backend::__omp_map_to(__first, __n);
+  _DifferenceType idx = __n;
+#  pragma omp target teams distribute parallel for simd reduction(min:idx)
+  for (_DifferenceType __i = 0; __i < __n; ++__i){
+    if (__pred(*(__first+__i))) {
+      idx = (__i < idx) ? __i : idx;
+    }
+  }
+  __omp_gpu_backend::__omp_map_free(__first, __n);
+  return idx;
+}
+
+// Extracting the underlying pointer
+
+template <class _ForwardIterator, class _DifferenceType, class _Predicate>
+_LIBCPP_HIDE_FROM_ABI optional<_ForwardIterator> __parallel_for_min_idx(_ForwardIterator __first, _DifferenceType __n, _Predicate __pred) noexcept {
+  return __first + __omp_gpu_backend::__omp_parallel_for_min_idx(__omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __pred);
+}
+
+}
+}
+
 template <class _ExecutionPolicy, class _ForwardIterator, class _Predicate>
-_LIBCPP_HIDE_FROM_ABI _ForwardIterator
+_LIBCPP_HIDE_FROM_ABI optional<_ForwardIterator>
 __pstl_find_if(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) {
-  // TODO: Implement the GPU backend
-  return std::__pstl_find_if<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __pred);
+  if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
+                __is_parallel_execution_policy_v<_ExecutionPolicy> &&
+                __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
+    return __par_backend::__parallel_for_min_idx(__first, __last - __first, __pred);
+  } else {
+    return std::__pstl_find_if<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __pred);
+  }
 }
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
index 30c2e5cca2f264..444352b4a5e51a 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
@@ -14,6 +14,8 @@
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
 #include <__type_traits/is_execution_policy.h>
+#include <optional>
+#include <__utility/empty.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -23,19 +25,44 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
+namespace __par_backend {
+inline namespace __omp_gpu_backend {
+
+template <class _Tp, class _DifferenceType, class _Function>
+_LIBCPP_HIDE_FROM_ABI optional<__empty> __omp_parallel_for_simd(
+    _Tp* __first, _DifferenceType __n, _Function __f) noexcept {
+  __omp_gpu_backend::__omp_map_to(__first, __n);
+#  pragma omp target teams distribute parallel for simd
+  for (_DifferenceType __i = 0; __i < __n; ++__i)
+    __f(*(__first+__i));
+  __omp_gpu_backend::__omp_map_from(__first, __n);
+  return __empty{};
+}
+
+// Extracting the underlying pointer
+
+template <class _ForwardIterator, class _DifferenceType, class _Function>
+_LIBCPP_HIDE_FROM_ABI optional<__empty> __parallel_for_simd_1(_ForwardIterator __first, _DifferenceType __n, _Function __f) noexcept {
+  return __omp_gpu_backend::__omp_parallel_for_simd(__omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __f);
+}
+
+}
+}
+
 template <class _ExecutionPolicy, class _ForwardIterator, class _Functor>
-_LIBCPP_HIDE_FROM_ABI void
+_LIBCPP_HIDE_FROM_ABI optional<__empty>
 __pstl_for_each(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Functor __func) {
   // It is only safe to execute for_each on the GPU, it the execution policy is
   // parallel unsequenced, as it is the only execution policy prohibiting throwing
   // exceptions and allowing SIMD instructions
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
+                __is_parallel_execution_policy_v<_ExecutionPolicy> &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
-    std::__par_backend::__parallel_for_simd_1(__first, __last - __first, __func);
+    return std::__par_backend::__parallel_for_simd_1(__first, __last - __first, __func);
   }
   // Else we fall back to the serial backend
   else {
-    std::__pstl_for_each<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __func);
+    return std::__pstl_for_each<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __func);
   }
 }
 
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/merge.h b/libcxx/include/__algorithm/pstl_backends/openmp/merge.h
index 666c3a0e000193..8fff9125add191 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/merge.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/merge.h
@@ -13,6 +13,7 @@
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
+#include <optional>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -27,7 +28,7 @@ template <class _ExecutionPolicy,
           class _ForwardIterator2,
           class _ForwardOutIterator,
           class _Comp>
-_LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_merge(
+_LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator> __pstl_merge(
     __omp_backend_tag,
     _ForwardIterator1 __first1,
     _ForwardIterator1 __last1,
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h b/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
index ee0bfe13bac901..77afb35ee71b4b 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
@@ -10,6 +10,7 @@
 #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_OMP_OFFLOAD_H
 
 #include <__assert>
+#include <__algorithm/unwrap_iter.h>
 #include <__config>
 #include <__functional/operations.h>
 #include <__iterator/iterator_traits.h>
@@ -19,7 +20,9 @@
 #include <__type_traits/is_pointer.h>
 #include <__type_traits/is_same.h>
 #include <__utility/move.h>
+#include <__utility/empty.h>
 #include <cstddef>
+#include <optional>
 
 // is_same
 
@@ -39,13 +42,15 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 namespace __par_backend {
 inline namespace __omp_gpu_backend {
 
+//===----------------------------------------------------------------------===//
 // Functions for eaxtracting the pase pointers
+//===----------------------------------------------------------------------===//
 
 // In the general case we do not need to extract it. This is for instance the
 // case for pointers.
 template <typename _Tp>
 _LIBCPP_HIDE_FROM_ABI inline auto __omp_extract_base_ptr(_Tp p) noexcept {
-  return p;
+  return std::__unwrap_iter(p);
 }
 
 // For vectors and arrays, etc, we need to extract the underlying base pointer.
@@ -64,159 +69,29 @@ _LIBCPP_HIDE_FROM_ABI inline _Tp __omp_extract_base_ptr(std::__wrap_iter<_Tp> w)
 template <class _Iterator, class _DifferenceType>
 _LIBCPP_HIDE_FROM_ABI void
 __omp_map_to([[maybe_unused]] const _Iterator p, [[maybe_unused]] const _DifferenceType len) noexcept {
-  if constexpr (__libcpp_is_contiguous_iterator<_Iterator>::value) {
-#  pragma omp target enter data map(to : p[0 : len])
-  } else {
-#  pragma omp target enter data map(to : p)
-  }
+  static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
+#pragma omp target enter data map(to : p[0 : len])
 }
 
 template <class _Iterator, class _DifferenceType>
 _LIBCPP_HIDE_FROM_ABI void
 __omp_map_from([[maybe_unused]] const _Iterator p, [[maybe_unused]] const _DifferenceType len) noexcept {
-  if constexpr (__libcpp_is_contiguous_iterator<_Iterator>::value) {
-#  pragma omp target exit data map(from : p[0 : len])
-  } else {
-#  pragma omp target exit data map(release : p)
-  }
+  static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
+#pragma omp target exit data map(from : p[0 : len])
 }
 
 template <class _Iterator, class _DifferenceType>
 _LIBCPP_HIDE_FROM_ABI void
 __omp_map_alloc([[maybe_unused]] const _Iterator p, [[maybe_unused]] const _DifferenceType len) noexcept {
-  if constexpr (__libcpp_is_contiguous_iterator<_Iterator>::value) {
-#  pragma omp target enter data map(alloc : p[0 : len])
-  } else {
-#  pragma omp target enter data map(to : p)
-  }
+  static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
+#pragma omp target enter data map(alloc : p[0 : len])
 }
 
 template <class _Iterator, class _DifferenceType>
 _LIBCPP_HIDE_FROM_ABI void
 __omp_map_free([[maybe_unused]] const _Iterator p, [[maybe_unused]] const _DifferenceType len) noexcept {
-  if constexpr (__libcpp_is_contiguous_iterator<_Iterator>::value) {
-#  pragma omp target exit data map(release : p[0 : len])
-  } else {
-#  pragma omp target exit data map(release : p)
-  }
-}
-
-//===----------------------------------------------------------------------===//
-// Templates for one iterator
-//===----------------------------------------------------------------------===//
-
-// Applying function or lambda in a loop
-
-template <class _Iterator, class _DifferenceType, class _Function>
-_LIBCPP_HIDE_FROM_ABI _Iterator __omp_parallel_for_simd_1(
-    _Iterator __first, _DifferenceType __n, _Function __f, [[maybe_unused]] const int __device = 0) noexcept {
-  __omp_map_to(__first, __n);
-#  pragma omp target teams distribute parallel for simd device(__device)
-  for (_DifferenceType __i = 0; __i < __n; ++__i)
-    __f(__first[__i]);
-  __omp_map_from(__first, __n);
-
-  return __first + __n;
-}
-
-// Extracting the underlying pointer
-
-template <class _Iterator, class _DifferenceType, class _Function>
-_LIBCPP_HIDE_FROM_ABI _Iterator __parallel_for_simd_1(_Iterator __first, _DifferenceType __n, _Function __f) noexcept {
-  __omp_parallel_for_simd_1(__omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __f);
-  return __first + __n;
-}
-
-// Assigning a value in a loop
-
-template <class _Index, class _DifferenceType, class _Tp>
-_LIBCPP_HIDE_FROM_ABI _Index __omp_parallel_for_simd_val_1(
-    _Index __first, _DifferenceType __n, const _Tp& __value, [[maybe_unused]] const int __device = 0) noexcept {
-  __omp_map_alloc(__first, __n);
-#  pragma omp target teams distribute parallel for simd firstprivate(__value) device(__device)
-  for (_DifferenceType __i = 0; __i < __n; ++__i)
-    __first[__i] = __value;
-  __omp_map_from(__first, __n);
-  return __first + __n;
-}
-
-template <class _Index, class _DifferenceType, class _Tp>
-_LIBCPP_HIDE_FROM_ABI _Index
-__parallel_for_simd_val_1(_Index __first, _DifferenceType __n, const _Tp& __value) noexcept {
-  __omp_parallel_for_simd_val_1(__omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __value);
-  return __first + __n;
-}
-
-//===----------------------------------------------------------------------===//
-// Templates for two iterators
-//===----------------------------------------------------------------------===//
-
-template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Function>
-_LIBCPP_HIDE_FROM_ABI _Iterator1 __omp_parallel_for_simd_2(
-    _Iterator1 __first1,
-    _DifferenceType __n,
-    _Iterator2 __first2,
-    _Function __f,
-    [[maybe_unused]] const int __device = 0) noexcept {
-  __omp_map_alloc(__first2, __n);
-  __omp_map_to(__first1, __n);
-#  pragma omp target teams distribute parallel for simd device(__device)
-  for (_DifferenceType __i = 0; __i < __n; ++__i)
-    *(__first2 + __i) = __f(*(__first1 + __i));
-  __omp_map_from(__first2, __n);
-  __omp_map_free(__first1, __n);
-  return __first1 + __n;
-}
-
-// Extracting the underlying pointer
-
-template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Function>
-_LIBCPP_HIDE_FROM_ABI _Iterator1
-__parallel_for_simd_2(_Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Function __f) noexcept {
-  __omp_parallel_for_simd_2(
-      __omp_gpu_backend::__omp_extract_base_ptr(__first1),
-      __n,
-      __omp_gpu_backend::__omp_extract_base_ptr(__first2),
-      __f);
-  return __first1 + __n;
-}
-
-//===----------------------------------------------------------------------===//
-// Templates for three iterator
-//===----------------------------------------------------------------------===//
-
-template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Iterator3, class _Function>
-_LIBCPP_HIDE_FROM_ABI _Iterator1 __omp_parallel_for_simd_3(
-    _Iterator1 __first1,
-    _DifferenceType __n,
-    _Iterator2 __first2,
-    _Iterator3 __first3,
-    _Function __f,
-    [[maybe_unused]] const int __device = 0) noexcept {
-  __omp_map_to(__first1, __n);
-  __omp_map_to(__first2, __n);
-  __omp_map_alloc(__first3, __n);
-#  pragma omp target teams distribute parallel for simd device(__device)
-  for (_DifferenceType __i = 0; __i < __n; ++__i)
-    *(__first3 + __i) = __f(*(__first1 + __i), *(__first2 + __i));
-  __omp_map_free(__first1, __n);
-  __omp_map_free(__first2, __n);
-  __omp_map_from(__first3, __n);
-  return __first1 + __n;
-}
-
-// Extracting the underlying pointer
-
-template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Iterator3, class _Function>
-_LIBCPP_HIDE_FROM_ABI _Iterator1 __parallel_for_simd_3(
-    _Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Iterator3 __first3, _Function __f) noexcept {
-  __omp_parallel_for_simd_3(
-      __omp_gpu_backend::__omp_extract_base_ptr(__first1),
-      __n,
-      __omp_gpu_backend::__omp_extract_base_ptr(__first2),
-      __omp_gpu_backend::__omp_extract_base_ptr(__first3),
-      __f);
-  return __first1 + __n;
+  static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
+#pragma omp target exit data map(release : p[0 : len])
 }
 
 //===----------------------------------------------------------------------===//
@@ -237,13 +112,12 @@ _LIBCPP_HIDE_FROM_ABI _Iterator1 __parallel_for_simd_3(
         _DifferenceType __n,                                                                                           \
         _Tp __init,                                                                                                    \
         std_op<_BinaryOperationType> __reduce,                                                                         \
-        _UnaryOperation __transform,                                                                                   \
-        [[maybe_unused]] const int __device = 0) noexcept {                                                            \
-      __omp_map_to(__first, __n);                                                                                      \
-_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init) device(__device))                  \
+        _UnaryOperation __transform) noexcept {                                                                         \
+      __omp_gpu_backend::__omp_map_to(__first, __n);                                                                                      \
+_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init))                                    \
       for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                  \
         __init = __reduce(__init, __transform(*(__first + __i)));                                                      \
-      __omp_map_free(__first, __n);                                                                                    \
+      __omp_gpu_backend::__omp_map_free(__first, __n);                                                                                    \
       return __init;                                                                                                   \
     }
 
@@ -260,15 +134,14 @@ _PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__in
         _DifferenceType __n,                                                                                           \
         _Tp __init,                                                                                                    \
         std_op<_BinaryOperationType> __reduce,                                                                         \
-        _UnaryOperation __transform,                                                                                   \
-        [[maybe_unused]] const int __device = 0) noexcept {                                                            \
-      __omp_map_to(__first1, __n);                                                                                     \
-      __omp_map_to(__first2, __n);                                                                                     \
-_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init) device(__device))                  \
+        _UnaryOperation __transform) noexcept {                                                                         \
+      __omp_gpu_backend::__omp_map_to(__first1, __n);                                                                                     \
+      __omp_gpu_backend::__omp_map_to(__first2, __n);                                                                                     \
+_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init))                                    \
       for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                  \
         __init = __reduce(__init, __transform(*(__first1 + __i), *(__first2 + __i)));                                  \
-      __omp_map_free(__first1, __n);                                                                                   \
-      __omp_map_free(__first2, __n);                                                                                   \
+      __omp_gpu_backend::__omp_map_free(__first1, __n);                                                                                   \
+      __omp_gpu_backend::__omp_map_free(__first2, __n);                                                                                   \
       return __init;                                                                                                   \
     } // namespace __omp_gpu_backend
 
@@ -308,9 +181,8 @@ _LIBCPP_HIDE_FROM_ABI _Tp __parallel_for_simd_reduction_1(
     _DifferenceType __n,
     _Tp __init,
     _BinaryOperation __reduce,
-    _UnaryOperation __transform,
-    [[maybe_unused]] const int __device = 0) noexcept {
-  return __omp_parallel_for_simd_reduction_1(
+    _UnaryOperation __transform) noexcept {
+  return __omp_gpu_backend::__omp_parallel_for_simd_reduction_1(
       __omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __init, __reduce, __transform);
 }
 
@@ -326,9 +198,8 @@ _LIBCPP_HIDE_FROM_ABI _Tp __parallel_for_simd_reduction_2(
     _DifferenceType __n,
     _Tp __init,
     _BinaryOperation __reduce,
-    _UnaryOperation __transform,
-    [[maybe_unused]] const int __device = 0) noexcept {
-  return __omp_parallel_for_simd_reduction_2(
+    _UnaryOperation __transform) noexcept {
+  return __omp_gpu_backend::__omp_parallel_for_simd_reduction_2(
       __omp_gpu_backend::__omp_extract_base_ptr(__first1),
       __omp_gpu_backend::__omp_extract_base_ptr(__first2),
       __n,
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/stable_sort.h b/libcxx/include/__algorithm/pstl_backends/openmp/stable_sort.h
index 46e48dd279e89b..a4c6a2bff9f92f 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/stable_sort.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/stable_sort.h
@@ -13,6 +13,8 @@
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__algorithm/stable_sort.h>
 #include <__config>
+#include <__utility/empty.h>
+#include <optional>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -23,10 +25,10 @@
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <class _ExecutionPolicy, class _RandomAccessIterator, class _Comp>
-_LIBCPP_HIDE_FROM_ABI void
+_LIBCPP_HIDE_FROM_ABI optional<__empty>
 __pstl_stable_sort(__omp_backend_tag, _RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp) {
   // TODO: Implement GPU backend.
-  __pstl_stable_sort<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __comp);
+  return __pstl_stable_sort<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __comp);
 }
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
index e507a16eac965b..b29a479c17887c 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
@@ -15,6 +15,7 @@
 #include <__config>
 #include <__iterator/concepts.h>
 #include <__type_traits/is_execution_policy.h>
+#include <optional>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -24,18 +25,92 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
+namespace __par_backend {
+inline namespace __omp_gpu_backend {
+
+//===----------------------------------------------------------------------===//
+// Templates for two iterators
+//===----------------------------------------------------------------------===//
+
+template <class _Tp, class _DifferenceType, class _Up, class _Function>
+_LIBCPP_HIDE_FROM_ABI optional<__empty> __omp_parallel_for_simd(
+    _Tp* __first1,
+    _DifferenceType __n,
+    _Up* __first2,
+    _Function __f) noexcept {
+  __omp_gpu_backend::__omp_map_alloc(__first2, __n);
+  __omp_gpu_backend::__omp_map_to(__first1, __n);
+#  pragma omp target teams distribute parallel for simd
+  for (_DifferenceType __i = 0; __i < __n; ++__i)
+    *(__first2 + __i) = __f(*(__first1 + __i));
+  __omp_gpu_backend::__omp_map_from(__first2, __n);
+  __omp_gpu_backend::__omp_map_free(__first1, __n);
+  return __empty{};
+}
+
+// Extracting the underlying pointer
+
+template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Function>
+_LIBCPP_HIDE_FROM_ABI optional<__empty>
+__parallel_for_simd(_Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Function __f) noexcept {
+  return __omp_gpu_backend::__omp_parallel_for_simd(
+      __omp_gpu_backend::__omp_extract_base_ptr(__first1),
+      __n,
+      __omp_gpu_backend::__omp_extract_base_ptr(__first2),
+      __f);
+}
+
+//===----------------------------------------------------------------------===//
+// Templates for three iterator
+//===----------------------------------------------------------------------===//
+
+template <class _Tp, class _DifferenceType, class _Up, class _Vp, class _Function>
+_LIBCPP_HIDE_FROM_ABI optional<__empty> __omp_parallel_for_simd(
+    _Tp* __first1,
+    _DifferenceType __n,
+    _Up* __first2,
+    _Vp* __first3,
+    _Function __f) noexcept {
+  __omp_gpu_backend::__omp_map_to(__first1, __n);
+  __omp_gpu_backend::__omp_map_to(__first2, __n);
+  __omp_gpu_backend::__omp_map_alloc(__first3, __n);
+#  pragma omp target teams distribute parallel for simd
+  for (_DifferenceType __i = 0; __i < __n; ++__i)
+    *(__first3 + __i) = __f(*(__first1 + __i), *(__first2 + __i));
+  __omp_gpu_backend::__omp_map_free(__first1, __n);
+  __omp_gpu_backend::__omp_map_free(__first2, __n);
+  __omp_gpu_backend::__omp_map_from(__first3, __n);
+  return __empty{};
+}
+
+// Extracting the underlying pointer
+
+template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Iterator3, class _Function>
+_LIBCPP_HIDE_FROM_ABI optional<__empty> __parallel_for_simd(
+    _Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Iterator3 __first3, _Function __f) noexcept {
+  return __omp_gpu_backend::__omp_parallel_for_simd(
+      __omp_gpu_backend::__omp_extract_base_ptr(__first1),
+      __n,
+      __omp_gpu_backend::__omp_extract_base_ptr(__first2),
+      __omp_gpu_backend::__omp_extract_base_ptr(__first3),
+      __f);
+}
+
+}
+}
+
 template <class _ExecutionPolicy, class _ForwardIterator, class _ForwardOutIterator, class _UnaryOperation>
-_LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
+_LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator> __pstl_transform(
     __omp_backend_tag,
     _ForwardIterator __first,
     _ForwardIterator __last,
     _ForwardOutIterator __result,
     _UnaryOperation __op) {
-  // The interface for the function switched between C++17 and C++20
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
-                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
+                __is_parallel_execution_policy_v<_ExecutionPolicy> &&
+                __libcpp_is_contiguous_iterator<_ForwardIterator>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
-    std::__par_backend::__parallel_for_simd_2(__first, __last - __first, __result, __op);
+    std::__par_backend::__parallel_for_simd(__first, __last - __first, __result, __op);
     return __result + (__last - __first);
   }
   // If it is not safe to offload to the GPU, we rely on the CPU backend.
@@ -48,19 +123,19 @@ template <class _ExecutionPolicy,
           class _ForwardOutIterator,
           class _BinaryOperation,
           enable_if_t<is_execution_policy_v<__remove_cvref_t<_ExecutionPolicy>>, int> = 0>
-_LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
+_LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator> __pstl_transform(
     __omp_backend_tag,
     _ForwardIterator1 __first1,
     _ForwardIterator1 __last1,
     _ForwardIterator2 __first2,
     _ForwardOutIterator __result,
     _BinaryOperation __op) {
-  // The interface for the function switched between C++17 and C++20
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
-                __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
-                __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value &&
+                __is_parallel_execution_policy_v<_ExecutionPolicy> &&
+                __libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
+                __libcpp_is_contiguous_iterator<_ForwardIterator2>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
-    std::__par_backend::__parallel_for_simd_3(__first1, __last1 - __first1, __first2, __result, __op);
+    std::__par_backend::__parallel_for_simd(__first1, __last1 - __first1, __first2, __result, __op);
     return __result + (__last1 - __first1);
   }
   // If it is not safe to offload to the GPU, we rely on the CPU backend.
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
index eb0a7ba52d8fdc..ddc3ba03f8e9b0 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
@@ -18,6 +18,7 @@
 #include <__type_traits/is_arithmetic.h>
 #include <__type_traits/is_execution_policy.h>
 #include <__type_traits/operation_traits.h>
+#include <optional>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -56,7 +57,7 @@ template <class _ExecutionPolicy,
           class _Tp,
           class _BinaryOperation1,
           class _BinaryOperation2>
-_LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
+_LIBCPP_HIDE_FROM_ABI optional<_Tp> __pstl_transform_reduce(
     __omp_backend_tag,
     _ForwardIterator1 __first1,
     _ForwardIterator1 __last1,
@@ -66,8 +67,10 @@ _LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
     _BinaryOperation2 __transform) {
   // The interface for the function switched between C++17 and C++20
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
-                __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
-                __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value && is_arithmetic_v<_Tp> &&
+                __is_parallel_execution_policy_v<_ExecutionPolicy> &&
+                __libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
+                __libcpp_is_contiguous_iterator<_ForwardIterator2>::value && 
+                is_arithmetic_v<_Tp> &&
                 __is_supported_reduction<_BinaryOperation1, _Tp, _Tp>::value) {
     return std::__par_backend::__parallel_for_simd_reduction_2(
         __first1, __first2, __last1 - __first1, __init, __reduce, __transform);
@@ -81,16 +84,17 @@ _LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
 //===----------------------------------------------------------------------===//
 
 template <class _ExecutionPolicy, class _ForwardIterator, class _Tp, class _BinaryOperation, class _UnaryOperation>
-_LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
+_LIBCPP_HIDE_FROM_ABI optional<_Tp> __pstl_transform_reduce(
     __omp_backend_tag,
     _ForwardIterator __first,
     _ForwardIterator __last,
     _Tp __init,
     _BinaryOperation __reduce,
     _UnaryOperation __transform) {
-  // The interface for the function switched between C++17 and C++20
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
-                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value && is_arithmetic_v<_Tp> &&
+                __is_parallel_execution_policy_v<_ExecutionPolicy> &&
+                __libcpp_is_contiguous_iterator<_ForwardIterator>::value && 
+                is_arithmetic_v<_Tp> &&
                 __is_supported_reduction<_BinaryOperation, _Tp, _Tp>::value) {
     return std::__par_backend::__parallel_for_simd_reduction_1(
         __first, __last - __first, __init, __reduce, __transform);

>From ffab5cc3a83a96b9a7944d56f5ca65a9c6c418e7 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Mon, 16 Oct 2023 08:06:44 -0700
Subject: [PATCH 16/65] Clang formatted files

---
 .../__algorithm/pstl_backends/openmp/fill.h   | 13 ++++----
 .../pstl_backends/openmp/find_if.h            | 19 ++++++-----
 .../pstl_backends/openmp/for_each.h           | 15 +++++----
 .../pstl_backends/openmp/omp_offload.h        | 32 +++++++++----------
 .../pstl_backends/openmp/transform.h          | 19 ++++-------
 .../pstl_backends/openmp/transform_reduce.h   |  6 ++--
 6 files changed, 50 insertions(+), 54 deletions(-)

diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/fill.h b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
index 250e514b4d526f..029b1877e309c1 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
@@ -29,12 +29,12 @@ namespace __par_backend {
 inline namespace __omp_gpu_backend {
 
 template <class _Tp, class _DifferenceType, class _Up>
-_LIBCPP_HIDE_FROM_ABI optional<__empty> __omp_parallel_for_simd_val(
-    _Tp* __first, _DifferenceType __n, const _Up& __value) noexcept {
+_LIBCPP_HIDE_FROM_ABI optional<__empty>
+__omp_parallel_for_simd_val(_Tp* __first, _DifferenceType __n, const _Up& __value) noexcept {
   __omp_gpu_backend::__omp_map_alloc(__first, __n);
 #  pragma omp target teams distribute parallel for simd firstprivate(__value)
   for (_DifferenceType __i = 0; __i < __n; ++__i)
-    *(__first+__i) = __value;
+    *(__first + __i) = __value;
   __omp_gpu_backend::__omp_map_from(__first, __n);
   return __empty{};
 }
@@ -42,11 +42,12 @@ _LIBCPP_HIDE_FROM_ABI optional<__empty> __omp_parallel_for_simd_val(
 template <class _ForwardIterator, class _DifferenceType, class _Tp>
 _LIBCPP_HIDE_FROM_ABI optional<__empty>
 __parallel_for_simd_val(_ForwardIterator __first, _DifferenceType __n, const _Tp& __value) noexcept {
-  return __omp_gpu_backend::__omp_parallel_for_simd_val(__omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __value);
+  return __omp_gpu_backend::__omp_parallel_for_simd_val(
+      __omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __value);
 }
 
-}
-}
+} // namespace __omp_gpu_backend
+} // namespace __par_backend
 
 template <class _ExecutionPolicy, class _ForwardIterator, class _Tp>
 _LIBCPP_HIDE_FROM_ABI optional<__empty>
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
index 94f7c42953c2a4..19b581a5f72b00 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
@@ -27,12 +27,13 @@ namespace __par_backend {
 inline namespace __omp_gpu_backend {
 
 template <class _Tp, class _DifferenceType, class _Predicate>
-_LIBCPP_HIDE_FROM_ABI _DifferenceType  __omp_parallel_for_min_idx(_Tp* __first, _DifferenceType __n, _Predicate __pred) noexcept {
+_LIBCPP_HIDE_FROM_ABI _DifferenceType
+__omp_parallel_for_min_idx(_Tp* __first, _DifferenceType __n, _Predicate __pred) noexcept {
   __omp_gpu_backend::__omp_map_to(__first, __n);
   _DifferenceType idx = __n;
-#  pragma omp target teams distribute parallel for simd reduction(min:idx)
-  for (_DifferenceType __i = 0; __i < __n; ++__i){
-    if (__pred(*(__first+__i))) {
+#  pragma omp target teams distribute parallel for simd reduction(min : idx)
+  for (_DifferenceType __i = 0; __i < __n; ++__i) {
+    if (__pred(*(__first + __i))) {
       idx = (__i < idx) ? __i : idx;
     }
   }
@@ -43,12 +44,14 @@ _LIBCPP_HIDE_FROM_ABI _DifferenceType  __omp_parallel_for_min_idx(_Tp* __first,
 // Extracting the underlying pointer
 
 template <class _ForwardIterator, class _DifferenceType, class _Predicate>
-_LIBCPP_HIDE_FROM_ABI optional<_ForwardIterator> __parallel_for_min_idx(_ForwardIterator __first, _DifferenceType __n, _Predicate __pred) noexcept {
-  return __first + __omp_gpu_backend::__omp_parallel_for_min_idx(__omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __pred);
+_LIBCPP_HIDE_FROM_ABI optional<_ForwardIterator>
+__parallel_for_min_idx(_ForwardIterator __first, _DifferenceType __n, _Predicate __pred) noexcept {
+  return __first +
+         __omp_gpu_backend::__omp_parallel_for_min_idx(__omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __pred);
 }
 
-}
-}
+} // namespace __omp_gpu_backend
+} // namespace __par_backend
 
 template <class _ExecutionPolicy, class _ForwardIterator, class _Predicate>
 _LIBCPP_HIDE_FROM_ABI optional<_ForwardIterator>
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
index 444352b4a5e51a..eea337179b6576 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
@@ -14,8 +14,8 @@
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
 #include <__type_traits/is_execution_policy.h>
-#include <optional>
 #include <__utility/empty.h>
+#include <optional>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -29,12 +29,12 @@ namespace __par_backend {
 inline namespace __omp_gpu_backend {
 
 template <class _Tp, class _DifferenceType, class _Function>
-_LIBCPP_HIDE_FROM_ABI optional<__empty> __omp_parallel_for_simd(
-    _Tp* __first, _DifferenceType __n, _Function __f) noexcept {
+_LIBCPP_HIDE_FROM_ABI optional<__empty>
+__omp_parallel_for_simd(_Tp* __first, _DifferenceType __n, _Function __f) noexcept {
   __omp_gpu_backend::__omp_map_to(__first, __n);
 #  pragma omp target teams distribute parallel for simd
   for (_DifferenceType __i = 0; __i < __n; ++__i)
-    __f(*(__first+__i));
+    __f(*(__first + __i));
   __omp_gpu_backend::__omp_map_from(__first, __n);
   return __empty{};
 }
@@ -42,12 +42,13 @@ _LIBCPP_HIDE_FROM_ABI optional<__empty> __omp_parallel_for_simd(
 // Extracting the underlying pointer
 
 template <class _ForwardIterator, class _DifferenceType, class _Function>
-_LIBCPP_HIDE_FROM_ABI optional<__empty> __parallel_for_simd_1(_ForwardIterator __first, _DifferenceType __n, _Function __f) noexcept {
+_LIBCPP_HIDE_FROM_ABI optional<__empty>
+__parallel_for_simd_1(_ForwardIterator __first, _DifferenceType __n, _Function __f) noexcept {
   return __omp_gpu_backend::__omp_parallel_for_simd(__omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __f);
 }
 
-}
-}
+} // namespace __omp_gpu_backend
+} // namespace __par_backend
 
 template <class _ExecutionPolicy, class _ForwardIterator, class _Functor>
 _LIBCPP_HIDE_FROM_ABI optional<__empty>
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h b/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
index 77afb35ee71b4b..183fdfa5cfa02a 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
@@ -9,8 +9,8 @@
 #ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_OMP_OFFLOAD_H
 #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_OMP_OFFLOAD_H
 
-#include <__assert>
 #include <__algorithm/unwrap_iter.h>
+#include <__assert>
 #include <__config>
 #include <__functional/operations.h>
 #include <__iterator/iterator_traits.h>
@@ -19,8 +19,8 @@
 #include <__memory/pointer_traits.h>
 #include <__type_traits/is_pointer.h>
 #include <__type_traits/is_same.h>
-#include <__utility/move.h>
 #include <__utility/empty.h>
+#include <__utility/move.h>
 #include <cstddef>
 #include <optional>
 
@@ -70,28 +70,28 @@ template <class _Iterator, class _DifferenceType>
 _LIBCPP_HIDE_FROM_ABI void
 __omp_map_to([[maybe_unused]] const _Iterator p, [[maybe_unused]] const _DifferenceType len) noexcept {
   static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
-#pragma omp target enter data map(to : p[0 : len])
+#  pragma omp target enter data map(to : p[0 : len])
 }
 
 template <class _Iterator, class _DifferenceType>
 _LIBCPP_HIDE_FROM_ABI void
 __omp_map_from([[maybe_unused]] const _Iterator p, [[maybe_unused]] const _DifferenceType len) noexcept {
   static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
-#pragma omp target exit data map(from : p[0 : len])
+#  pragma omp target exit data map(from : p[0 : len])
 }
 
 template <class _Iterator, class _DifferenceType>
 _LIBCPP_HIDE_FROM_ABI void
 __omp_map_alloc([[maybe_unused]] const _Iterator p, [[maybe_unused]] const _DifferenceType len) noexcept {
   static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
-#pragma omp target enter data map(alloc : p[0 : len])
+#  pragma omp target enter data map(alloc : p[0 : len])
 }
 
 template <class _Iterator, class _DifferenceType>
 _LIBCPP_HIDE_FROM_ABI void
 __omp_map_free([[maybe_unused]] const _Iterator p, [[maybe_unused]] const _DifferenceType len) noexcept {
   static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
-#pragma omp target exit data map(release : p[0 : len])
+#  pragma omp target exit data map(release : p[0 : len])
 }
 
 //===----------------------------------------------------------------------===//
@@ -112,12 +112,12 @@ __omp_map_free([[maybe_unused]] const _Iterator p, [[maybe_unused]] const _Diffe
         _DifferenceType __n,                                                                                           \
         _Tp __init,                                                                                                    \
         std_op<_BinaryOperationType> __reduce,                                                                         \
-        _UnaryOperation __transform) noexcept {                                                                         \
-      __omp_gpu_backend::__omp_map_to(__first, __n);                                                                                      \
-_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init))                                    \
+        _UnaryOperation __transform) noexcept {                                                                        \
+      __omp_gpu_backend::__omp_map_to(__first, __n);                                                                   \
+_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init))                                   \
       for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                  \
         __init = __reduce(__init, __transform(*(__first + __i)));                                                      \
-      __omp_gpu_backend::__omp_map_free(__first, __n);                                                                                    \
+      __omp_gpu_backend::__omp_map_free(__first, __n);                                                                 \
       return __init;                                                                                                   \
     }
 
@@ -134,14 +134,14 @@ _PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__in
         _DifferenceType __n,                                                                                           \
         _Tp __init,                                                                                                    \
         std_op<_BinaryOperationType> __reduce,                                                                         \
-        _UnaryOperation __transform) noexcept {                                                                         \
-      __omp_gpu_backend::__omp_map_to(__first1, __n);                                                                                     \
-      __omp_gpu_backend::__omp_map_to(__first2, __n);                                                                                     \
-_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init))                                    \
+        _UnaryOperation __transform) noexcept {                                                                        \
+      __omp_gpu_backend::__omp_map_to(__first1, __n);                                                                  \
+      __omp_gpu_backend::__omp_map_to(__first2, __n);                                                                  \
+_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init))                                   \
       for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                  \
         __init = __reduce(__init, __transform(*(__first1 + __i), *(__first2 + __i)));                                  \
-      __omp_gpu_backend::__omp_map_free(__first1, __n);                                                                                   \
-      __omp_gpu_backend::__omp_map_free(__first2, __n);                                                                                   \
+      __omp_gpu_backend::__omp_map_free(__first1, __n);                                                                \
+      __omp_gpu_backend::__omp_map_free(__first2, __n);                                                                \
       return __init;                                                                                                   \
     } // namespace __omp_gpu_backend
 
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
index b29a479c17887c..ba218dddd4be56 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
@@ -33,11 +33,8 @@ inline namespace __omp_gpu_backend {
 //===----------------------------------------------------------------------===//
 
 template <class _Tp, class _DifferenceType, class _Up, class _Function>
-_LIBCPP_HIDE_FROM_ABI optional<__empty> __omp_parallel_for_simd(
-    _Tp* __first1,
-    _DifferenceType __n,
-    _Up* __first2,
-    _Function __f) noexcept {
+_LIBCPP_HIDE_FROM_ABI optional<__empty>
+__omp_parallel_for_simd(_Tp* __first1, _DifferenceType __n, _Up* __first2, _Function __f) noexcept {
   __omp_gpu_backend::__omp_map_alloc(__first2, __n);
   __omp_gpu_backend::__omp_map_to(__first1, __n);
 #  pragma omp target teams distribute parallel for simd
@@ -65,12 +62,8 @@ __parallel_for_simd(_Iterator1 __first1, _DifferenceType __n, _Iterator2 __first
 //===----------------------------------------------------------------------===//
 
 template <class _Tp, class _DifferenceType, class _Up, class _Vp, class _Function>
-_LIBCPP_HIDE_FROM_ABI optional<__empty> __omp_parallel_for_simd(
-    _Tp* __first1,
-    _DifferenceType __n,
-    _Up* __first2,
-    _Vp* __first3,
-    _Function __f) noexcept {
+_LIBCPP_HIDE_FROM_ABI optional<__empty>
+__omp_parallel_for_simd(_Tp* __first1, _DifferenceType __n, _Up* __first2, _Vp* __first3, _Function __f) noexcept {
   __omp_gpu_backend::__omp_map_to(__first1, __n);
   __omp_gpu_backend::__omp_map_to(__first2, __n);
   __omp_gpu_backend::__omp_map_alloc(__first3, __n);
@@ -96,8 +89,8 @@ _LIBCPP_HIDE_FROM_ABI optional<__empty> __parallel_for_simd(
       __f);
 }
 
-}
-}
+} // namespace __omp_gpu_backend
+} // namespace __par_backend
 
 template <class _ExecutionPolicy, class _ForwardIterator, class _ForwardOutIterator, class _UnaryOperation>
 _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator> __pstl_transform(
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
index ddc3ba03f8e9b0..1427184d04d0c8 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
@@ -69,8 +69,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_Tp> __pstl_transform_reduce(
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __is_parallel_execution_policy_v<_ExecutionPolicy> &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
-                __libcpp_is_contiguous_iterator<_ForwardIterator2>::value && 
-                is_arithmetic_v<_Tp> &&
+                __libcpp_is_contiguous_iterator<_ForwardIterator2>::value && is_arithmetic_v<_Tp> &&
                 __is_supported_reduction<_BinaryOperation1, _Tp, _Tp>::value) {
     return std::__par_backend::__parallel_for_simd_reduction_2(
         __first1, __first2, __last1 - __first1, __init, __reduce, __transform);
@@ -93,8 +92,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_Tp> __pstl_transform_reduce(
     _UnaryOperation __transform) {
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __is_parallel_execution_policy_v<_ExecutionPolicy> &&
-                __libcpp_is_contiguous_iterator<_ForwardIterator>::value && 
-                is_arithmetic_v<_Tp> &&
+                __libcpp_is_contiguous_iterator<_ForwardIterator>::value && is_arithmetic_v<_Tp> &&
                 __is_supported_reduction<_BinaryOperation, _Tp, _Tp>::value) {
     return std::__par_backend::__parallel_for_simd_reduction_1(
         __first, __last - __first, __init, __reduce, __transform);

>From 0c93bc5b77e5adf60e580eac4e501096c96c1171 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Thu, 19 Oct 2023 11:35:54 -0700
Subject: [PATCH 17/65] Fixing issues discussed during code review on the 16th
 of October

- Updated CMake logic
- Made `fill` call the shared `__omp_for_each` helper with a lambda that assigns the constant value.
- Used `__rewrap_iter` and `__unwrap_iter`.
---
 libcxx/CMakeLists.txt                         |  13 +-
 .../pstl_backends/openmp/backend.h            |   8 +
 .../__algorithm/pstl_backends/openmp/fill.h   |  31 +---
 .../pstl_backends/openmp/find_if.h            |  27 +---
 .../pstl_backends/openmp/for_each.h           |  29 +---
 .../pstl_backends/openmp/omp_offload.h        | 144 ++----------------
 .../pstl_backends/openmp/transform.h          |  67 +++-----
 .../pstl_backends/openmp/transform_reduce.h   | 130 ++++++++++++----
 8 files changed, 163 insertions(+), 286 deletions(-)

diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt
index e8234d47d024bd..ca78038ab359d9 100644
--- a/libcxx/CMakeLists.txt
+++ b/libcxx/CMakeLists.txt
@@ -295,15 +295,14 @@ option(LIBCXX_HAS_EXTERNAL_THREAD_API
   "Build libc++ with an externalized threading API.
    This option may only be set to ON when LIBCXX_ENABLE_THREADS=ON." OFF)
 
-set(LIBCXX_PSTL_BACKEND "openmp" CACHE INTERNAL "Which PSTL backend to use")
-if (LIBCXX_PSTL_BACKEND STREQUAL "")
-  if (LIBCXX_ENABLE_THREADS)
-    set(LIBCXX_PSTL_BACKEND "std-thread")
-  else()
-    set(LIBCXX_PSTL_BACKEND "serial")
-  endif()
+if (LIBCXX_ENABLE_THREADS)
+  set(LIBCXX_PSTL_BACKEND_DEFAULT "std-thread")
+else()
+  set(LIBCXX_PSTL_BACKEND_DEFAULT "serial")
 endif()
 
+set(LIBCXX_PSTL_BACKEND "${LIBCXX_PSTL_BACKEND_DEFAULT}" CACHE STRING "Select the PSTL backend to use. Valid values are serial, std-thread, libdispatch, openmp. Default: ${LIBCXX_PSTL_BACKEND_DEFAULT}")
+
 # Misc options ----------------------------------------------------------------
 # FIXME: Turn -pedantic back ON. It is currently off because it warns
 # about #include_next which is used everywhere.
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/backend.h b/libcxx/include/__algorithm/pstl_backends/openmp/backend.h
index e4e6136082a342..9396f91b1a755f 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/backend.h
@@ -20,6 +20,14 @@
 
 #if _LIBCPP_STD_VER >= 17
 
+#if defined(_LIBCPP_PSTL_BACKEND_OPENMP)
+# if !defined(_OPENMP)
+//#   warning "PSTL is configured to use the OpenMP backend, but OpenMP is not enabled. Did you compile with -fopenmp?"
+# elif (defined(_OPENMP) && _OPENMP < 201511)
+//#   warning "OpenMP target offloading has been supported since OpenMP version 4.5 (201511). Please use a more recent version of OpenMP."
+# endif
+#endif
+
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 struct __omp_backend_tag {};
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/fill.h b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
index 029b1877e309c1..e8101233e342a3 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
@@ -13,7 +13,9 @@
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
+#include <__iterator/wrap_iter.h>
 #include <__type_traits/is_execution_policy.h>
+#include <__type_traits/remove_pointer.h>
 #include <__utility/empty.h>
 #include <optional>
 
@@ -25,30 +27,6 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
-namespace __par_backend {
-inline namespace __omp_gpu_backend {
-
-template <class _Tp, class _DifferenceType, class _Up>
-_LIBCPP_HIDE_FROM_ABI optional<__empty>
-__omp_parallel_for_simd_val(_Tp* __first, _DifferenceType __n, const _Up& __value) noexcept {
-  __omp_gpu_backend::__omp_map_alloc(__first, __n);
-#  pragma omp target teams distribute parallel for simd firstprivate(__value)
-  for (_DifferenceType __i = 0; __i < __n; ++__i)
-    *(__first + __i) = __value;
-  __omp_gpu_backend::__omp_map_from(__first, __n);
-  return __empty{};
-}
-
-template <class _ForwardIterator, class _DifferenceType, class _Tp>
-_LIBCPP_HIDE_FROM_ABI optional<__empty>
-__parallel_for_simd_val(_ForwardIterator __first, _DifferenceType __n, const _Tp& __value) noexcept {
-  return __omp_gpu_backend::__omp_parallel_for_simd_val(
-      __omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __value);
-}
-
-} // namespace __omp_gpu_backend
-} // namespace __par_backend
-
 template <class _ExecutionPolicy, class _ForwardIterator, class _Tp>
 _LIBCPP_HIDE_FROM_ABI optional<__empty>
 __pstl_fill(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) {
@@ -58,7 +36,10 @@ __pstl_fill(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __last
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __is_parallel_execution_policy_v<_ExecutionPolicy> &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
-    return std::__par_backend::__parallel_for_simd_val(__first, __last - __first, __value);
+    std::__rewrap_iter(__first, __par_backend::__omp_for_each(std::__unwrap_iter(__first), __last - __first, [&](std::remove_pointer_t<decltype(std::__unwrap_iter(__first))>& e) {
+          e = __value;
+        }));
+    return __empty{};
   }
   // Otherwise, we execute fill on the CPU instead
   else {
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
index 19b581a5f72b00..f07965885c9c68 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
@@ -13,6 +13,7 @@
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
+#include <__iterator/wrap_iter.h>
 #include <optional>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -23,13 +24,9 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
-namespace __par_backend {
-inline namespace __omp_gpu_backend {
-
 template <class _Tp, class _DifferenceType, class _Predicate>
-_LIBCPP_HIDE_FROM_ABI _DifferenceType
-__omp_parallel_for_min_idx(_Tp* __first, _DifferenceType __n, _Predicate __pred) noexcept {
-  __omp_gpu_backend::__omp_map_to(__first, __n);
+_LIBCPP_HIDE_FROM_ABI _Tp* __omp_find_if(_Tp* __first, _DifferenceType __n, _Predicate __pred) noexcept {
+  __par_backend::__omp_map_to(__first, __n);
   _DifferenceType idx = __n;
 #  pragma omp target teams distribute parallel for simd reduction(min : idx)
   for (_DifferenceType __i = 0; __i < __n; ++__i) {
@@ -37,29 +34,17 @@ __omp_parallel_for_min_idx(_Tp* __first, _DifferenceType __n, _Predicate __pred)
       idx = (__i < idx) ? __i : idx;
     }
   }
-  __omp_gpu_backend::__omp_map_free(__first, __n);
-  return idx;
-}
-
-// Extracting the underlying pointer
-
-template <class _ForwardIterator, class _DifferenceType, class _Predicate>
-_LIBCPP_HIDE_FROM_ABI optional<_ForwardIterator>
-__parallel_for_min_idx(_ForwardIterator __first, _DifferenceType __n, _Predicate __pred) noexcept {
-  return __first +
-         __omp_gpu_backend::__omp_parallel_for_min_idx(__omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __pred);
+  __par_backend::__omp_map_free(__first, __n);
+  return __first + idx;
 }
 
-} // namespace __omp_gpu_backend
-} // namespace __par_backend
-
 template <class _ExecutionPolicy, class _ForwardIterator, class _Predicate>
 _LIBCPP_HIDE_FROM_ABI optional<_ForwardIterator>
 __pstl_find_if(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) {
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __is_parallel_execution_policy_v<_ExecutionPolicy> &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
-    return __par_backend::__parallel_for_min_idx(__first, __last - __first, __pred);
+    return std::__rewrap_iter(__first, std::__omp_find_if(std::__unwrap_iter(__first), __last - __first, __pred));
   } else {
     return std::__pstl_find_if<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __pred);
   }
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
index eea337179b6576..eb8caf23971cb3 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
@@ -13,6 +13,7 @@
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
+#include <__iterator/wrap_iter.h>
 #include <__type_traits/is_execution_policy.h>
 #include <__utility/empty.h>
 #include <optional>
@@ -25,31 +26,6 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
-namespace __par_backend {
-inline namespace __omp_gpu_backend {
-
-template <class _Tp, class _DifferenceType, class _Function>
-_LIBCPP_HIDE_FROM_ABI optional<__empty>
-__omp_parallel_for_simd(_Tp* __first, _DifferenceType __n, _Function __f) noexcept {
-  __omp_gpu_backend::__omp_map_to(__first, __n);
-#  pragma omp target teams distribute parallel for simd
-  for (_DifferenceType __i = 0; __i < __n; ++__i)
-    __f(*(__first + __i));
-  __omp_gpu_backend::__omp_map_from(__first, __n);
-  return __empty{};
-}
-
-// Extracting the underlying pointer
-
-template <class _ForwardIterator, class _DifferenceType, class _Function>
-_LIBCPP_HIDE_FROM_ABI optional<__empty>
-__parallel_for_simd_1(_ForwardIterator __first, _DifferenceType __n, _Function __f) noexcept {
-  return __omp_gpu_backend::__omp_parallel_for_simd(__omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __f);
-}
-
-} // namespace __omp_gpu_backend
-} // namespace __par_backend
-
 template <class _ExecutionPolicy, class _ForwardIterator, class _Functor>
 _LIBCPP_HIDE_FROM_ABI optional<__empty>
 __pstl_for_each(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Functor __func) {
@@ -59,7 +35,8 @@ __pstl_for_each(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
-    return std::__par_backend::__parallel_for_simd_1(__first, __last - __first, __func);
+    __par_backend::__omp_for_each(std::__unwrap_iter(__first), __last - __first, __func);
+    return __empty{};
   }
   // Else we fall back to the serial backend
   else {
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h b/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
index 183fdfa5cfa02a..2e44af3c9a761f 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
@@ -22,7 +22,6 @@
 #include <__utility/empty.h>
 #include <__utility/move.h>
 #include <cstddef>
-#include <optional>
 
 // is_same
 
@@ -43,27 +42,9 @@ namespace __par_backend {
 inline namespace __omp_gpu_backend {
 
 //===----------------------------------------------------------------------===//
-// Functions for eaxtracting the pase pointers
-//===----------------------------------------------------------------------===//
-
-// In the general case we do not need to extract it. This is for instance the
-// case for pointers.
-template <typename _Tp>
-_LIBCPP_HIDE_FROM_ABI inline auto __omp_extract_base_ptr(_Tp p) noexcept {
-  return std::__unwrap_iter(p);
-}
-
-// For vectors and arrays, etc, we need to extract the underlying base pointer.
-template <typename _Tp>
-_LIBCPP_HIDE_FROM_ABI inline _Tp __omp_extract_base_ptr(std::__wrap_iter<_Tp> w) noexcept {
-  std::pointer_traits<std::__wrap_iter<_Tp>> PT;
-  return PT.to_address(w);
-}
-
-//===----------------------------------------------------------------------===//
-// The following four functions differentiates between contiguous iterators and
-// non-contiguous iterators. That allows to use the same implementations for
-// reference and value iterators
+// The following four functions can be used to map contiguous array sections to
+// and from the device. For now, they are simple overlays of the OpenMP pragmas,
+// but they should be updated when adding support for other iterator types.
 //===----------------------------------------------------------------------===//
 
 template <class _Iterator, class _DifferenceType>
@@ -95,117 +76,18 @@ __omp_map_free([[maybe_unused]] const _Iterator p, [[maybe_unused]] const _Diffe
 }
 
 //===----------------------------------------------------------------------===//
-// Templates for reductions
+// The OpenMP implementation of for_each is shared between for_each and fill
 //===----------------------------------------------------------------------===//
 
-// In the two following function templates, we map the pointer to the device in
-// different ways depending on if they are contiguou or not.
-
-#  define __PSTL_OMP_SIMD_1_REDUCTION(omp_op, std_op)                                                                  \
-    template <class _Iterator,                                                                                         \
-              class _DifferenceType,                                                                                   \
-              typename _Tp,                                                                                            \
-              typename _BinaryOperationType,                                                                           \
-              typename _UnaryOperation>                                                                                \
-    _LIBCPP_HIDE_FROM_ABI _Tp __omp_parallel_for_simd_reduction_1(                                                     \
-        _Iterator __first,                                                                                             \
-        _DifferenceType __n,                                                                                           \
-        _Tp __init,                                                                                                    \
-        std_op<_BinaryOperationType> __reduce,                                                                         \
-        _UnaryOperation __transform) noexcept {                                                                        \
-      __omp_gpu_backend::__omp_map_to(__first, __n);                                                                   \
-_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init))                                   \
-      for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                  \
-        __init = __reduce(__init, __transform(*(__first + __i)));                                                      \
-      __omp_gpu_backend::__omp_map_free(__first, __n);                                                                 \
-      return __init;                                                                                                   \
-    }
-
-#  define __PSTL_OMP_SIMD_2_REDUCTION(omp_op, std_op)                                                                  \
-    template <class _Iterator1,                                                                                        \
-              class _Iterator2,                                                                                        \
-              class _DifferenceType,                                                                                   \
-              typename _Tp,                                                                                            \
-              typename _BinaryOperationType,                                                                           \
-              typename _UnaryOperation >                                                                               \
-    _LIBCPP_HIDE_FROM_ABI _Tp __omp_parallel_for_simd_reduction_2(                                                     \
-        _Iterator1 __first1,                                                                                           \
-        _Iterator2 __first2,                                                                                           \
-        _DifferenceType __n,                                                                                           \
-        _Tp __init,                                                                                                    \
-        std_op<_BinaryOperationType> __reduce,                                                                         \
-        _UnaryOperation __transform) noexcept {                                                                        \
-      __omp_gpu_backend::__omp_map_to(__first1, __n);                                                                  \
-      __omp_gpu_backend::__omp_map_to(__first2, __n);                                                                  \
-_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init))                                   \
-      for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                  \
-        __init = __reduce(__init, __transform(*(__first1 + __i), *(__first2 + __i)));                                  \
-      __omp_gpu_backend::__omp_map_free(__first1, __n);                                                                \
-      __omp_gpu_backend::__omp_map_free(__first2, __n);                                                                \
-      return __init;                                                                                                   \
-    } // namespace __omp_gpu_backend
-
-#  define __PSTL_OMP_SIMD_REDUCTION(omp_op, std_op)                                                                    \
-    __PSTL_OMP_SIMD_1_REDUCTION(omp_op, std_op)                                                                        \
-    __PSTL_OMP_SIMD_2_REDUCTION(omp_op, std_op)
-
-// Addition
-__PSTL_OMP_SIMD_REDUCTION(+, std::plus)
-
-// Subtraction
-__PSTL_OMP_SIMD_REDUCTION(-, std::minus)
-
-// Multiplication
-__PSTL_OMP_SIMD_REDUCTION(*, std::multiplies)
-
-// Logical and
-__PSTL_OMP_SIMD_REDUCTION(&&, std::logical_and)
-
-// Logical or
-__PSTL_OMP_SIMD_REDUCTION(||, std::logical_or)
-
-// Bitwise and
-__PSTL_OMP_SIMD_REDUCTION(&, std::bit_and)
-
-// Bitwise or
-__PSTL_OMP_SIMD_REDUCTION(|, std::bit_or)
-
-// Bitwise xor
-__PSTL_OMP_SIMD_REDUCTION(^, std::bit_xor)
-
-// Extracting the underlying pointers
-
-template <class _Iterator, class _DifferenceType, typename _Tp, typename _BinaryOperation, typename _UnaryOperation >
-_LIBCPP_HIDE_FROM_ABI _Tp __parallel_for_simd_reduction_1(
-    _Iterator __first,
-    _DifferenceType __n,
-    _Tp __init,
-    _BinaryOperation __reduce,
-    _UnaryOperation __transform) noexcept {
-  return __omp_gpu_backend::__omp_parallel_for_simd_reduction_1(
-      __omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __init, __reduce, __transform);
-}
-
-template <class _Iterator1,
-          class _Iterator2,
-          class _DifferenceType,
-          typename _Tp,
-          typename _BinaryOperation,
-          typename _UnaryOperation >
-_LIBCPP_HIDE_FROM_ABI _Tp __parallel_for_simd_reduction_2(
-    _Iterator1 __first1,
-    _Iterator2 __first2,
-    _DifferenceType __n,
-    _Tp __init,
-    _BinaryOperation __reduce,
-    _UnaryOperation __transform) noexcept {
-  return __omp_gpu_backend::__omp_parallel_for_simd_reduction_2(
-      __omp_gpu_backend::__omp_extract_base_ptr(__first1),
-      __omp_gpu_backend::__omp_extract_base_ptr(__first2),
-      __n,
-      __init,
-      __reduce,
-      __transform);
+template <class _Tp, class _DifferenceType, class _Function>
+_LIBCPP_HIDE_FROM_ABI _Tp*
+__omp_for_each(_Tp* __first, _DifferenceType __n, _Function __f) noexcept {
+  __par_backend::__omp_map_to(__first, __n);
+#  pragma omp target teams distribute parallel for simd
+  for (_DifferenceType __i = 0; __i < __n; ++__i)
+    __f(*(__first + __i));
+  __par_backend::__omp_map_from(__first, __n);
+  return __first + __n;
 }
 
 } // namespace __omp_gpu_backend
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
index ba218dddd4be56..f7e52d7933b6c4 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
@@ -14,6 +14,7 @@
 #include <__algorithm/transform.h>
 #include <__config>
 #include <__iterator/concepts.h>
+#include <__iterator/wrap_iter.h>
 #include <__type_traits/is_execution_policy.h>
 #include <optional>
 
@@ -25,73 +26,39 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
-namespace __par_backend {
-inline namespace __omp_gpu_backend {
-
 //===----------------------------------------------------------------------===//
-// Templates for two iterators
+// OpenMP implementations of transform for one and two input iterators and one
+// output iterator
 //===----------------------------------------------------------------------===//
 
 template <class _Tp, class _DifferenceType, class _Up, class _Function>
 _LIBCPP_HIDE_FROM_ABI optional<__empty>
-__omp_parallel_for_simd(_Tp* __first1, _DifferenceType __n, _Up* __first2, _Function __f) noexcept {
-  __omp_gpu_backend::__omp_map_alloc(__first2, __n);
-  __omp_gpu_backend::__omp_map_to(__first1, __n);
+__omp_transform(_Tp* __first1, _DifferenceType __n, _Up* __first2, _Function __f) noexcept {
+  __par_backend::__omp_map_alloc(__first2, __n);
+  __par_backend::__omp_map_to(__first1, __n);
 #  pragma omp target teams distribute parallel for simd
   for (_DifferenceType __i = 0; __i < __n; ++__i)
     *(__first2 + __i) = __f(*(__first1 + __i));
-  __omp_gpu_backend::__omp_map_from(__first2, __n);
-  __omp_gpu_backend::__omp_map_free(__first1, __n);
+  __par_backend::__omp_map_from(__first2, __n);
+  __par_backend::__omp_map_free(__first1, __n);
   return __empty{};
 }
 
-// Extracting the underlying pointer
-
-template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Function>
-_LIBCPP_HIDE_FROM_ABI optional<__empty>
-__parallel_for_simd(_Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Function __f) noexcept {
-  return __omp_gpu_backend::__omp_parallel_for_simd(
-      __omp_gpu_backend::__omp_extract_base_ptr(__first1),
-      __n,
-      __omp_gpu_backend::__omp_extract_base_ptr(__first2),
-      __f);
-}
-
-//===----------------------------------------------------------------------===//
-// Templates for three iterator
-//===----------------------------------------------------------------------===//
-
 template <class _Tp, class _DifferenceType, class _Up, class _Vp, class _Function>
 _LIBCPP_HIDE_FROM_ABI optional<__empty>
-__omp_parallel_for_simd(_Tp* __first1, _DifferenceType __n, _Up* __first2, _Vp* __first3, _Function __f) noexcept {
-  __omp_gpu_backend::__omp_map_to(__first1, __n);
-  __omp_gpu_backend::__omp_map_to(__first2, __n);
-  __omp_gpu_backend::__omp_map_alloc(__first3, __n);
+__omp_transform(_Tp* __first1, _DifferenceType __n, _Up* __first2, _Vp* __first3, _Function __f) noexcept {
+  __par_backend::__omp_map_to(__first1, __n);
+  __par_backend::__omp_map_to(__first2, __n);
+  __par_backend::__omp_map_alloc(__first3, __n);
 #  pragma omp target teams distribute parallel for simd
   for (_DifferenceType __i = 0; __i < __n; ++__i)
     *(__first3 + __i) = __f(*(__first1 + __i), *(__first2 + __i));
-  __omp_gpu_backend::__omp_map_free(__first1, __n);
-  __omp_gpu_backend::__omp_map_free(__first2, __n);
-  __omp_gpu_backend::__omp_map_from(__first3, __n);
+  __par_backend::__omp_map_free(__first1, __n);
+  __par_backend::__omp_map_free(__first2, __n);
+  __par_backend::__omp_map_from(__first3, __n);
   return __empty{};
 }
 
-// Extracting the underlying pointer
-
-template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Iterator3, class _Function>
-_LIBCPP_HIDE_FROM_ABI optional<__empty> __parallel_for_simd(
-    _Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Iterator3 __first3, _Function __f) noexcept {
-  return __omp_gpu_backend::__omp_parallel_for_simd(
-      __omp_gpu_backend::__omp_extract_base_ptr(__first1),
-      __n,
-      __omp_gpu_backend::__omp_extract_base_ptr(__first2),
-      __omp_gpu_backend::__omp_extract_base_ptr(__first3),
-      __f);
-}
-
-} // namespace __omp_gpu_backend
-} // namespace __par_backend
-
 template <class _ExecutionPolicy, class _ForwardIterator, class _ForwardOutIterator, class _UnaryOperation>
 _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator> __pstl_transform(
     __omp_backend_tag,
@@ -103,7 +70,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator> __pstl_transform(
                 __is_parallel_execution_policy_v<_ExecutionPolicy> &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
-    std::__par_backend::__parallel_for_simd(__first, __last - __first, __result, __op);
+    std::__omp_transform(std::__unwrap_iter(__first), __last - __first, std::__unwrap_iter(__result), __op);
     return __result + (__last - __first);
   }
   // If it is not safe to offload to the GPU, we rely on the CPU backend.
@@ -128,7 +95,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator> __pstl_transform(
                 __libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator2>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
-    std::__par_backend::__parallel_for_simd(__first1, __last1 - __first1, __first2, __result, __op);
+    std::__omp_transform(std::__unwrap_iter(__first1), __last1 - __first1, std::__unwrap_iter(__first2), std::__unwrap_iter(__result), __op);
     return __result + (__last1 - __first1);
   }
   // If it is not safe to offload to the GPU, we rely on the CPU backend.
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
index 1427184d04d0c8..3798e1cb3f3fac 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
@@ -13,6 +13,7 @@
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
 #include <__iterator/concepts.h>
+#include <__iterator/wrap_iter.h>
 #include <__numeric/transform_reduce.h>
 #include <__type_traits/integral_constant.h>
 #include <__type_traits/is_arithmetic.h>
@@ -28,6 +29,87 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
+//===----------------------------------------------------------------------===//
+// Templates for predefined reductions
+//===----------------------------------------------------------------------===//
+
+#  define __PSTL_OMP_SIMD_1_REDUCTION(omp_op, std_op)                                                                  \
+    template <class _Iterator,                                                                                         \
+              class _DifferenceType,                                                                                   \
+              typename _Tp,                                                                                            \
+              typename _BinaryOperationType,                                                                           \
+              typename _UnaryOperation>                                                                                \
+    _LIBCPP_HIDE_FROM_ABI _Tp __omp_transform_reduce(                                                     \
+        _Iterator __first,                                                                                             \
+        _DifferenceType __n,                                                                                           \
+        _Tp __init,                                                                                                    \
+        std_op<_BinaryOperationType> __reduce,                                                                         \
+        _UnaryOperation __transform) noexcept {                                                                        \
+      __par_backend::__omp_map_to(__first, __n);                                                                   \
+_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init))                                   \
+      for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                  \
+        __init = __reduce(__init, __transform(*(__first + __i)));                                                      \
+      __par_backend::__omp_map_free(__first, __n);                                                                 \
+      return __init;                                                                                                   \
+    }
+
+#  define __PSTL_OMP_SIMD_2_REDUCTION(omp_op, std_op)                                                                  \
+    template <class _Iterator1,                                                                                        \
+              class _Iterator2,                                                                                        \
+              class _DifferenceType,                                                                                   \
+              typename _Tp,                                                                                            \
+              typename _BinaryOperationType,                                                                           \
+              typename _UnaryOperation >                                                                               \
+    _LIBCPP_HIDE_FROM_ABI _Tp __omp_transform_reduce(                                                     \
+        _Iterator1 __first1,                                                                                           \
+        _Iterator2 __first2,                                                                                           \
+        _DifferenceType __n,                                                                                           \
+        _Tp __init,                                                                                                    \
+        std_op<_BinaryOperationType> __reduce,                                                                         \
+        _UnaryOperation __transform) noexcept {                                                                        \
+      __par_backend::__omp_map_to(__first1, __n);                                                                  \
+      __par_backend::__omp_map_to(__first2, __n);                                                                  \
+_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init))                                   \
+      for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                  \
+        __init = __reduce(__init, __transform(*(__first1 + __i), *(__first2 + __i)));                                  \
+      __par_backend::__omp_map_free(__first1, __n);                                                                \
+      __par_backend::__omp_map_free(__first2, __n);                                                                \
+      return __init;                                                                                                   \
+    }
+
+#  define __PSTL_OMP_SIMD_REDUCTION(omp_op, std_op)                                                                    \
+    __PSTL_OMP_SIMD_1_REDUCTION(omp_op, std_op)                                                                        \
+    __PSTL_OMP_SIMD_2_REDUCTION(omp_op, std_op)
+
+// Addition
+__PSTL_OMP_SIMD_REDUCTION(+, std::plus)
+
+// Subtraction
+__PSTL_OMP_SIMD_REDUCTION(-, std::minus)
+
+// Multiplication
+__PSTL_OMP_SIMD_REDUCTION(*, std::multiplies)
+
+// Logical and
+__PSTL_OMP_SIMD_REDUCTION(&&, std::logical_and)
+
+// Logical or
+__PSTL_OMP_SIMD_REDUCTION(||, std::logical_or)
+
+// Bitwise and
+__PSTL_OMP_SIMD_REDUCTION(&, std::bit_and)
+
+// Bitwise or
+__PSTL_OMP_SIMD_REDUCTION(|, std::bit_or)
+
+// Bitwise xor
+__PSTL_OMP_SIMD_REDUCTION(^, std::bit_xor)
+
+//===----------------------------------------------------------------------===//
+// The following struct is used to determine whether a reduction is supported by
+// the OpenMP backend.
+//===----------------------------------------------------------------------===//
+
 template <class _T1, class _T2, class _T3>
 struct __is_supported_reduction : std::false_type {};
 
@@ -48,9 +130,28 @@ __PSTL_IS_SUPPORTED_REDUCTION(bit_or)
 __PSTL_IS_SUPPORTED_REDUCTION(bit_xor)
 
 //===----------------------------------------------------------------------===//
-// Two input iterators
+// Implementation of PSTL transform_reduce for one and two input iterators
 //===----------------------------------------------------------------------===//
 
+template <class _ExecutionPolicy, class _ForwardIterator, class _Tp, class _BinaryOperation, class _UnaryOperation>
+_LIBCPP_HIDE_FROM_ABI optional<_Tp> __pstl_transform_reduce(
+    __omp_backend_tag,
+    _ForwardIterator __first,
+    _ForwardIterator __last,
+    _Tp __init,
+    _BinaryOperation __reduce,
+    _UnaryOperation __transform) {
+  if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
+                __is_parallel_execution_policy_v<_ExecutionPolicy> &&
+                __libcpp_is_contiguous_iterator<_ForwardIterator>::value && is_arithmetic_v<_Tp> &&
+                __is_supported_reduction<_BinaryOperation, _Tp, _Tp>::value) {
+    return std::__omp_transform_reduce(
+        std::__unwrap_iter(__first), __last - __first, __init, __reduce, __transform);
+  }
+  return std::__pstl_transform_reduce<_ExecutionPolicy>(
+      __cpu_backend_tag{}, __first, __last, std::move(__init), __reduce, __transform);
+}
+
 template <class _ExecutionPolicy,
           class _ForwardIterator1,
           class _ForwardIterator2,
@@ -71,36 +172,13 @@ _LIBCPP_HIDE_FROM_ABI optional<_Tp> __pstl_transform_reduce(
                 __libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator2>::value && is_arithmetic_v<_Tp> &&
                 __is_supported_reduction<_BinaryOperation1, _Tp, _Tp>::value) {
-    return std::__par_backend::__parallel_for_simd_reduction_2(
-        __first1, __first2, __last1 - __first1, __init, __reduce, __transform);
+    return std::__omp_transform_reduce(
+        std::__unwrap_iter(__first1), std::__unwrap_iter(__first2), __last1 - __first1, __init, __reduce, __transform);
   }
   return std::__pstl_transform_reduce<_ExecutionPolicy>(
       __cpu_backend_tag{}, __first1, __last1, __first2, std::move(__init), __reduce, __transform);
 }
 
-//===----------------------------------------------------------------------===//
-// One input iterator
-//===----------------------------------------------------------------------===//
-
-template <class _ExecutionPolicy, class _ForwardIterator, class _Tp, class _BinaryOperation, class _UnaryOperation>
-_LIBCPP_HIDE_FROM_ABI optional<_Tp> __pstl_transform_reduce(
-    __omp_backend_tag,
-    _ForwardIterator __first,
-    _ForwardIterator __last,
-    _Tp __init,
-    _BinaryOperation __reduce,
-    _UnaryOperation __transform) {
-  if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
-                __is_parallel_execution_policy_v<_ExecutionPolicy> &&
-                __libcpp_is_contiguous_iterator<_ForwardIterator>::value && is_arithmetic_v<_Tp> &&
-                __is_supported_reduction<_BinaryOperation, _Tp, _Tp>::value) {
-    return std::__par_backend::__parallel_for_simd_reduction_1(
-        __first, __last - __first, __init, __reduce, __transform);
-  }
-  return std::__pstl_transform_reduce<_ExecutionPolicy>(
-      __cpu_backend_tag{}, __first, __last, std::move(__init), __reduce, __transform);
-}
-
 _LIBCPP_END_NAMESPACE_STD
 
 #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17

>From 8de2ec891186d378eadd06b611d05e30e5261ae2 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Thu, 19 Oct 2023 17:55:42 -0700
Subject: [PATCH 18/65] Added OpenMP offloading documentation to libcxx

---
 libcxx/docs/BuildingLibcxx.rst |  11 ++++
 libcxx/docs/UsingLibcxx.rst    | 107 +++++++++++++++++++++++++++++++++
 2 files changed, 118 insertions(+)

diff --git a/libcxx/docs/BuildingLibcxx.rst b/libcxx/docs/BuildingLibcxx.rst
index 2cee97c03ced08..cbd6183cbd3566 100644
--- a/libcxx/docs/BuildingLibcxx.rst
+++ b/libcxx/docs/BuildingLibcxx.rst
@@ -408,6 +408,17 @@ libc++ Feature Options
   Use the specified GCC toolchain and standard library when building the native
   stdlib benchmark tests.
 
+.. option:: LIBCXX_PSTL_BACKEND:STRING
+
+  **Default**: ``"serial"``
+
+  **Values**: ``serial``, ``std-thread``, ``libdispatch``, ``openmp``
+
+  Select the desired backend for C++ parallel algorithms. All four options can
+  target multi-core CPU architectures, and ``openmp`` can additionally target
+  GPU architectures. The ``openmp`` backend requires OpenMP version 4.5 or
+  later.
+
 
 libc++ ABI Feature Options
 --------------------------
diff --git a/libcxx/docs/UsingLibcxx.rst b/libcxx/docs/UsingLibcxx.rst
index 52c76f3b10548f..9ef4814d00254f 100644
--- a/libcxx/docs/UsingLibcxx.rst
+++ b/libcxx/docs/UsingLibcxx.rst
@@ -466,6 +466,113 @@ Unpoisoning may not be an option, if (for example) you are not maintaining the a
 * You are using allocator, which does not call destructor during deallocation.
 * You are aware that memory allocated with an allocator may be accessed, even when unused by container.
 
+Offloading C++ Parallel Algorithms to GPUs
+------------------------------------------
+
+Experimental support for GPU offloading has been added to ``libc++``. The
+implementation uses OpenMP target offloading to leverage GPU compute resources.
+The OpenMP PSTL backend can target both NVIDIA and AMD GPUs.
+However, the implementation only supports contiguous iterators, such as 
+iterators for ``std::vector`` or ``std::array``.
+To enable the OpenMP offloading backend it must be selected with
+``LIBCXX_PSTL_BACKEND=openmp`` when installing ``libc++``. Further, when
+compiling a program, the user must specify the command line options
+``-fopenmp -fexperimental-library -stdlib=libc++``. To install LLVM with OpenMP
+offloading enabled, please read
+`the LLVM OpenMP FAQ. <https://openmp.llvm.org/SupportAndFAQ.html>`_ 
+You may also want to visit
+`the OpenMP offloading command-line argument reference. <https://openmp.llvm.org/CommandLineArgumentReference.html#offload-command-line-arguments>`_ 
+
+Example
+~~~~~~~
+
+The following is an example of offloading vector addition to a GPU using our
+standard library extension.
+
+.. code-block:: cpp
+
+  #include <algorithm>
+  #include <execution>
+
+  template<typename T1, typename T2, typename T3>
+  void axpy(const T1 a,std::vector<T2>& x, std::vector<T3>& y)
+  {
+    std::transform(std::execution::par_unseq,x.begin(),x.end(), y.begin(), y.begin(),
+                  [=](T2 xi, T3 yi){ return a*xi + yi; });
+  }
+
+The execution policy ``std::execution::par_unseq`` states that the algorithm's
+execution may be parallelized, vectorized, and migrated across threads. This is
+the only execution mode that is safe to offload to GPUs, and for all other
+execution modes the algorithms will execute on the CPU.
+Special attention must be paid to the lambda captures when enabling GPU
+offloading. If the lambda captures by reference, the user must manually map the
+variables to the device. If capturing by reference, the above example could
+be implemented in the following way.
+
+.. code-block:: cpp
+
+  template<typename T1, typename T2, typename T3>
+  void axpy(const T1 a,std::vector<T2>& x, std::vector<T3>& y)
+  {
+  # pragma omp target data map(to:a)
+    std::transform(std::execution::par_unseq,x.begin(),x.end(), y.begin(), y.begin(),
+                  [&](T2 xi, T3 yi){ return a*xi + yi; });
+  }
+
+However, if unified shared memory, USM, is enabled, no additional data mapping
+is necessary when capturing by reference.
+
+Compiling functions for GPUs with OpenMP
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The C++ standard defines that all accesses to memory are inside a single address
+space. However, discrete GPU systems have distinct address spaces. A single
+address space can be emulated if your system supports unified shared memory.
+However, many discrete GPU systems do not, and in those cases it is important to
+pass device function pointers to the parallel algorithms. Below is an example of
+how the OpenMP ``declare target`` directive can be used to mark that a function
+should be compiled for both host and device. The device address of a function
+pointer can be obtained with ``target map(from:<list of identifiers>)``.
+
+.. code-block:: cpp
+
+  // Declare that the function must be compiled for both host and device
+  #pragma omp declare target
+  void cube(int& n) {n*=n*n; };
+  #pragma omp end declare target
+
+  int main()
+  {
+    int * a =  new int[LEN];
+    // Initialize the array to 2 on the device
+    std::fill(std::execution::par_unseq,a, a+LEN,2);
+    // Get the device pointer for cube
+    void (*dcube)(int& n);
+    #pragma omp target map(from:dcube)
+    dcube = &cube;
+    // Pass the device function pointer to the parallel algorithm
+    std::for_each(std::execution::par_unseq,a, a+LEN,dcube);
+    // Validate that the result is 8 on the host for all array indices
+    std::for_each(std::execution::par,a, a+LEN,[&](int & n){
+      assert(n == 8);
+    });
+    delete[] a;
+    return 0;
+  }
+
+Without unified shared memory, the above example will not work if the host
+function pointer ``cube`` is passed to the parallel algorithm.
+
+Important notes about exception handling
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+GPU architectures do not support exception handling. If compiling a program
+containing parallel algorithms with ``clang`` 18 or newer, a program with
+exceptions in offloaded code regions will compile, but the program will
+terminate if an exception is thrown on the device. This does not conform with
+the C++ standard and exception handling on GPUs will hopefully be better
+supported in future releases of LLVM.
+
 Platform specific behavior
 ==========================
 

>From 2910feb320bb89408918da4e76bc22fafb510b3e Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Fri, 20 Oct 2023 11:26:26 -0700
Subject: [PATCH 19/65] Changed argument names to reflect input and output

---
 .../pstl_backends/openmp/backend.h            | 15 +++---
 .../__algorithm/pstl_backends/openmp/fill.h   | 20 +++++---
 .../pstl_backends/openmp/find_if.h            |  5 +-
 .../pstl_backends/openmp/for_each.h           | 17 +++++--
 .../pstl_backends/openmp/omp_offload.h        | 21 +-------
 .../pstl_backends/openmp/transform.h          | 51 +++++++++++++------
 .../pstl_backends/openmp/transform_reduce.h   | 20 ++++----
 7 files changed, 85 insertions(+), 64 deletions(-)

diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/backend.h b/libcxx/include/__algorithm/pstl_backends/openmp/backend.h
index 9396f91b1a755f..99245a7d13e991 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/backend.h
@@ -20,13 +20,14 @@
 
 #if _LIBCPP_STD_VER >= 17
 
-#if defined(_LIBCPP_PSTL_BACKEND_OPENMP)
-# if !defined(_OPENMP)
-//#   warning "PSTL is configured to use the OpenMP backend, but OpenMP is not enabled. Did you compile with -fopenmp?"
-# elif (defined(_OPENMP) && _OPENMP < 201511)
-//#   warning "OpenMP target offloading has been supported since OpenMP version 4.5 (201511). Please use a more recent version of OpenMP."
-# endif
-#endif
+#  if defined(_LIBCPP_PSTL_BACKEND_OPENMP)
+#    if !defined(_OPENMP)
+// #   warning "PSTL is configured to use the OpenMP backend, but OpenMP is not enabled. Did you compile with -fopenmp?"
+#    elif (defined(_OPENMP) && _OPENMP < 201511)
+// #   warning "OpenMP target offloading has been supported since OpenMP version 4.5 (201511). Please use a more recent
+// version of OpenMP."
+#    endif
+#  endif
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/fill.h b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
index e8101233e342a3..4262f778fa9121 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
@@ -27,18 +27,26 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
+template <class _Tp, class _DifferenceType, class _Up>
+_LIBCPP_HIDE_FROM_ABI _Tp* __omp_fill(_Tp* __out1, _DifferenceType __n, const _Up& __value) noexcept {
+  __par_backend::__omp_map_alloc(__out1, __n);
+#  pragma omp target teams distribute parallel for simd
+  for (_DifferenceType __i = 0; __i < __n; ++__i)
+    *(__out1 + __i) = __value;
+  __par_backend::__omp_map_from(__out1, __n);
+  return __out1 + __n;
+}
+
 template <class _ExecutionPolicy, class _ForwardIterator, class _Tp>
 _LIBCPP_HIDE_FROM_ABI optional<__empty>
 __pstl_fill(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) {
-  // It is only safe to execute fill on the GPU, it the execution policy is
-  // parallel unsequenced, as it is the only execution policy allowing
-  // SIMD instructions
+  // If it is safe to offload the computations to the GPU, we call the OpenMP
+  // implementation of for_each. In the case of fill, we provide for_each with a
+  // lambda returning a constant.
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __is_parallel_execution_policy_v<_ExecutionPolicy> &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
-    std::__rewrap_iter(__first, __par_backend::__omp_for_each(std::__unwrap_iter(__first), __last - __first, [&](std::remove_pointer_t<decltype(std::__unwrap_iter(__first))>& e) {
-          e = __value;
-        }));
+    std::__rewrap_iter(__first, std::__omp_fill(std::__unwrap_iter(__first), __last - __first, __value));
     return __empty{};
   }
   // Otherwise, we execute fill on the CPU instead
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
index f07965885c9c68..8d7b196ec176f7 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
@@ -34,17 +34,20 @@ _LIBCPP_HIDE_FROM_ABI _Tp* __omp_find_if(_Tp* __first, _DifferenceType __n, _Pre
       idx = (__i < idx) ? __i : idx;
     }
   }
-  __par_backend::__omp_map_free(__first, __n);
+  __par_backend::__omp_map_release(__first, __n);
   return __first + idx;
 }
 
 template <class _ExecutionPolicy, class _ForwardIterator, class _Predicate>
 _LIBCPP_HIDE_FROM_ABI optional<_ForwardIterator>
 __pstl_find_if(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) {
+  // If it is safe to offload the computations to the GPU, we call the OpenMP
+  // implementation of find_if
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __is_parallel_execution_policy_v<_ExecutionPolicy> &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
     return std::__rewrap_iter(__first, std::__omp_find_if(std::__unwrap_iter(__first), __last - __first, __pred));
+    // Else we rely on the CPU PSTL backend
   } else {
     return std::__pstl_find_if<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __pred);
   }
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
index eb8caf23971cb3..f64efc170537af 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
@@ -26,16 +26,25 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
+template <class _Tp, class _DifferenceType, class _Function>
+_LIBCPP_HIDE_FROM_ABI _Tp* __omp_for_each(_Tp* __inout1, _DifferenceType __n, _Function __f) noexcept {
+  __par_backend::__omp_map_to(__inout1, __n);
+#  pragma omp target teams distribute parallel for simd
+  for (_DifferenceType __i = 0; __i < __n; ++__i)
+    __f(*(__inout1 + __i));
+  __par_backend::__omp_map_from(__inout1, __n);
+  return __inout1 + __n;
+}
+
 template <class _ExecutionPolicy, class _ForwardIterator, class _Functor>
 _LIBCPP_HIDE_FROM_ABI optional<__empty>
 __pstl_for_each(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Functor __func) {
-  // It is only safe to execute for_each on the GPU, it the execution policy is
-  // parallel unsequenced, as it is the only execution policy prohibiting throwing
-  // exceptions and allowing SIMD instructions
+  // If it is safe to offload the computations to the GPU, we call the OpenMP
+  // implementation of for_each
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
-    __par_backend::__omp_for_each(std::__unwrap_iter(__first), __last - __first, __func);
+    std::__omp_for_each(std::__unwrap_iter(__first), __last - __first, __func);
     return __empty{};
   }
   // Else we fall back to the serial backend
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h b/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
index 2e44af3c9a761f..88ee4cdbbc4fae 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
@@ -23,10 +23,6 @@
 #include <__utility/move.h>
 #include <cstddef>
 
-// is_same
-
-// __libcpp_is_contiguous_iterator
-
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
 #endif
@@ -70,26 +66,11 @@ __omp_map_alloc([[maybe_unused]] const _Iterator p, [[maybe_unused]] const _Diff
 
 template <class _Iterator, class _DifferenceType>
 _LIBCPP_HIDE_FROM_ABI void
-__omp_map_free([[maybe_unused]] const _Iterator p, [[maybe_unused]] const _DifferenceType len) noexcept {
+__omp_map_release([[maybe_unused]] const _Iterator p, [[maybe_unused]] const _DifferenceType len) noexcept {
   static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
 #  pragma omp target exit data map(release : p[0 : len])
 }
 
-//===----------------------------------------------------------------------===//
-// The OpenMP implementation of for_each is shared between for_each and fill
-//===----------------------------------------------------------------------===//
-
-template <class _Tp, class _DifferenceType, class _Function>
-_LIBCPP_HIDE_FROM_ABI _Tp*
-__omp_for_each(_Tp* __first, _DifferenceType __n, _Function __f) noexcept {
-  __par_backend::__omp_map_to(__first, __n);
-#  pragma omp target teams distribute parallel for simd
-  for (_DifferenceType __i = 0; __i < __n; ++__i)
-    __f(*(__first + __i));
-  __par_backend::__omp_map_from(__first, __n);
-  return __first + __n;
-}
-
 } // namespace __omp_gpu_backend
 } // namespace __par_backend
 
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
index f7e52d7933b6c4..bd795d5571f0d7 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
@@ -33,29 +33,45 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <class _Tp, class _DifferenceType, class _Up, class _Function>
 _LIBCPP_HIDE_FROM_ABI optional<__empty>
-__omp_transform(_Tp* __first1, _DifferenceType __n, _Up* __first2, _Function __f) noexcept {
-  __par_backend::__omp_map_alloc(__first2, __n);
-  __par_backend::__omp_map_to(__first1, __n);
+__omp_transform(_Tp* __in1, _DifferenceType __n, _Up* __out1, _Function __f) noexcept {
+  // The order of the following maps matter, as we wish to move the data. If
+  // they were placed in the reverse order, and __in equals __out, then we would
+  // allocate the buffer on the device without copying the data.
+  __par_backend::__omp_map_to(__in1, __n);
+  __par_backend::__omp_map_alloc(__out1, __n);
 #  pragma omp target teams distribute parallel for simd
   for (_DifferenceType __i = 0; __i < __n; ++__i)
-    *(__first2 + __i) = __f(*(__first1 + __i));
-  __par_backend::__omp_map_from(__first2, __n);
-  __par_backend::__omp_map_free(__first1, __n);
+    *(__out1 + __i) = __f(*(__in1 + __i));
+  // The order of the following two maps matters, since the user could legally
+  // overwrite __in. The "release" map modifier decreases the reference counter
+  // by one, and "from" only moves the data to the host, when the reference
+  // count is decremented to zero.
+  __par_backend::__omp_map_release(__in1, __n);
+  __par_backend::__omp_map_from(__out1, __n);
   return __empty{};
 }
 
 template <class _Tp, class _DifferenceType, class _Up, class _Vp, class _Function>
 _LIBCPP_HIDE_FROM_ABI optional<__empty>
-__omp_transform(_Tp* __first1, _DifferenceType __n, _Up* __first2, _Vp* __first3, _Function __f) noexcept {
-  __par_backend::__omp_map_to(__first1, __n);
-  __par_backend::__omp_map_to(__first2, __n);
-  __par_backend::__omp_map_alloc(__first3, __n);
+__omp_transform(_Tp* __in1, _DifferenceType __n, _Up* __in2, _Vp* __out1, _Function __f) noexcept {
+  // The order of the following maps matter, as we wish to move the data. If
+  // they were placed in the reverse order, and __out equals __in1 or __in2,
+  // then we would allocate one of the buffers on the device without copying the
+  // data.
+  __par_backend::__omp_map_to(__in1, __n);
+  __par_backend::__omp_map_to(__in2, __n);
+  __par_backend::__omp_map_alloc(__out1, __n);
 #  pragma omp target teams distribute parallel for simd
   for (_DifferenceType __i = 0; __i < __n; ++__i)
-    *(__first3 + __i) = __f(*(__first1 + __i), *(__first2 + __i));
-  __par_backend::__omp_map_free(__first1, __n);
-  __par_backend::__omp_map_free(__first2, __n);
-  __par_backend::__omp_map_from(__first3, __n);
+    *(__out1 + __i) = __f(*(__in1 + __i), *(__in2 + __i));
+  // The order of the following three maps matters, since the user could legally
+  // overwrite either of the inputs if __out equals __in1 or __in2. The
+  // "release" map modifier decreases the reference counter by one, and "from"
+  // only moves the data from the device, when the reference count is
+  // decremented to zero.
+  __par_backend::__omp_map_release(__in1, __n);
+  __par_backend::__omp_map_release(__in2, __n);
+  __par_backend::__omp_map_from(__out1, __n);
   return __empty{};
 }
 
@@ -95,7 +111,12 @@ _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator> __pstl_transform(
                 __libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator2>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
-    std::__omp_transform(std::__unwrap_iter(__first1), __last1 - __first1, std::__unwrap_iter(__first2), std::__unwrap_iter(__result), __op);
+    std::__omp_transform(
+        std::__unwrap_iter(__first1),
+        __last1 - __first1,
+        std::__unwrap_iter(__first2),
+        std::__unwrap_iter(__result),
+        __op);
     return __result + (__last1 - __first1);
   }
   // If it is not safe to offload to the GPU, we rely on the CPU backend.
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
index 3798e1cb3f3fac..1684c3d273b223 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
@@ -39,17 +39,17 @@ _LIBCPP_BEGIN_NAMESPACE_STD
               typename _Tp,                                                                                            \
               typename _BinaryOperationType,                                                                           \
               typename _UnaryOperation>                                                                                \
-    _LIBCPP_HIDE_FROM_ABI _Tp __omp_transform_reduce(                                                     \
+    _LIBCPP_HIDE_FROM_ABI _Tp __omp_transform_reduce(                                                                  \
         _Iterator __first,                                                                                             \
         _DifferenceType __n,                                                                                           \
         _Tp __init,                                                                                                    \
         std_op<_BinaryOperationType> __reduce,                                                                         \
         _UnaryOperation __transform) noexcept {                                                                        \
-      __par_backend::__omp_map_to(__first, __n);                                                                   \
+      __par_backend::__omp_map_to(__first, __n);                                                                       \
 _PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init))                                   \
       for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                  \
         __init = __reduce(__init, __transform(*(__first + __i)));                                                      \
-      __par_backend::__omp_map_free(__first, __n);                                                                 \
+      __par_backend::__omp_map_release(__first, __n);                                                                  \
       return __init;                                                                                                   \
     }
 
@@ -60,20 +60,20 @@ _PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__in
               typename _Tp,                                                                                            \
               typename _BinaryOperationType,                                                                           \
               typename _UnaryOperation >                                                                               \
-    _LIBCPP_HIDE_FROM_ABI _Tp __omp_transform_reduce(                                                     \
+    _LIBCPP_HIDE_FROM_ABI _Tp __omp_transform_reduce(                                                                  \
         _Iterator1 __first1,                                                                                           \
         _Iterator2 __first2,                                                                                           \
         _DifferenceType __n,                                                                                           \
         _Tp __init,                                                                                                    \
         std_op<_BinaryOperationType> __reduce,                                                                         \
         _UnaryOperation __transform) noexcept {                                                                        \
-      __par_backend::__omp_map_to(__first1, __n);                                                                  \
-      __par_backend::__omp_map_to(__first2, __n);                                                                  \
+      __par_backend::__omp_map_to(__first1, __n);                                                                      \
+      __par_backend::__omp_map_to(__first2, __n);                                                                      \
 _PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init))                                   \
       for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                  \
         __init = __reduce(__init, __transform(*(__first1 + __i), *(__first2 + __i)));                                  \
-      __par_backend::__omp_map_free(__first1, __n);                                                                \
-      __par_backend::__omp_map_free(__first2, __n);                                                                \
+      __par_backend::__omp_map_release(__first1, __n);                                                                 \
+      __par_backend::__omp_map_release(__first2, __n);                                                                 \
       return __init;                                                                                                   \
     }
 
@@ -145,8 +145,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_Tp> __pstl_transform_reduce(
                 __is_parallel_execution_policy_v<_ExecutionPolicy> &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value && is_arithmetic_v<_Tp> &&
                 __is_supported_reduction<_BinaryOperation, _Tp, _Tp>::value) {
-    return std::__omp_transform_reduce(
-        std::__unwrap_iter(__first), __last - __first, __init, __reduce, __transform);
+    return std::__omp_transform_reduce(std::__unwrap_iter(__first), __last - __first, __init, __reduce, __transform);
   }
   return std::__pstl_transform_reduce<_ExecutionPolicy>(
       __cpu_backend_tag{}, __first, __last, std::move(__init), __reduce, __transform);
@@ -166,7 +165,6 @@ _LIBCPP_HIDE_FROM_ABI optional<_Tp> __pstl_transform_reduce(
     _Tp __init,
     _BinaryOperation1 __reduce,
     _BinaryOperation2 __transform) {
-  // The interface for the function switched between C++17 and C++20
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __is_parallel_execution_policy_v<_ExecutionPolicy> &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&

>From 2a697a62011527df93fa33b6d90cedade70195c9 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Fri, 20 Oct 2023 11:42:13 -0700
Subject: [PATCH 20/65] Adding compile time errors to
 pstl_backend/openmp/backend.h to validate that OpenMP is enabled

---
 libcxx/docs/UsingLibcxx.rst                               | 1 +
 libcxx/include/__algorithm/pstl_backends/openmp/backend.h | 5 ++---
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/libcxx/docs/UsingLibcxx.rst b/libcxx/docs/UsingLibcxx.rst
index 9ef4814d00254f..c9726ef4f60401 100644
--- a/libcxx/docs/UsingLibcxx.rst
+++ b/libcxx/docs/UsingLibcxx.rst
@@ -525,6 +525,7 @@ is necessary when capturing y reference.
 
 Compiling functions for GPUs with OpenMP
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
 The C++ standard defines that all accesses to memory are inside a single address
 space. However, discrete GPU systems have distinct address spaces. A single
 address space can be emulated if your system supports unified shared memory.
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/backend.h b/libcxx/include/__algorithm/pstl_backends/openmp/backend.h
index 99245a7d13e991..11ab0bb108b225 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/backend.h
@@ -22,10 +22,9 @@
 
 #  if defined(_LIBCPP_PSTL_BACKEND_OPENMP)
 #    if !defined(_OPENMP)
-// #   warning "PSTL is configured to use the OpenMP backend, but OpenMP is not enabled. Did you compile with -fopenmp?"
+#   error "PSTL is configured to use the OpenMP backend, but OpenMP is not enabled. Did you compile with -fopenmp?"
 #    elif (defined(_OPENMP) && _OPENMP < 201511)
-// #   warning "OpenMP target offloading has been supported since OpenMP version 4.5 (201511). Please use a more recent
-// version of OpenMP."
+#   error "OpenMP target offloading has been supported since OpenMP version 4.5 (201511). Please use a more recent version of OpenMP."
 #    endif
 #  endif
 

>From 5500b03ea472c61b2ca76acddd879d19a2f4ec1a Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Fri, 20 Oct 2023 11:58:27 -0700
Subject: [PATCH 21/65] Clang-formatted error message in backend.h

---
 libcxx/include/__algorithm/pstl_backends/openmp/backend.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/backend.h b/libcxx/include/__algorithm/pstl_backends/openmp/backend.h
index 11ab0bb108b225..eb5e40d6f20a94 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/backend.h
@@ -22,9 +22,10 @@
 
 #  if defined(_LIBCPP_PSTL_BACKEND_OPENMP)
 #    if !defined(_OPENMP)
-#   error "PSTL is configured to use the OpenMP backend, but OpenMP is not enabled. Did you compile with -fopenmp?"
+#      error "PSTL is configured to use the OpenMP backend, but OpenMP is not enabled. Did you compile with -fopenmp?"
 #    elif (defined(_OPENMP) && _OPENMP < 201511)
-#   error "OpenMP target offloading has been supported since OpenMP version 4.5 (201511). Please use a more recent version of OpenMP."
+#      error                                                                                                           \
+          "OpenMP target offloading has been supported since OpenMP version 4.5 (201511). Please use a more recent version of OpenMP."
 #    endif
 #  endif
 

>From 3c7b6fc2e88e5ed911e5ee8e471df5f14c6c839b Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Fri, 20 Oct 2023 12:48:16 -0700
Subject: [PATCH 22/65] Disabling ADL in pstl_backends/openmp/stable_sort.h

---
 libcxx/include/__algorithm/pstl_backends/openmp/stable_sort.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/stable_sort.h b/libcxx/include/__algorithm/pstl_backends/openmp/stable_sort.h
index a4c6a2bff9f92f..0b5ce39a2344a9 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/stable_sort.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/stable_sort.h
@@ -28,7 +28,7 @@ template <class _ExecutionPolicy, class _RandomAccessIterator, class _Comp>
 _LIBCPP_HIDE_FROM_ABI optional<__empty>
 __pstl_stable_sort(__omp_backend_tag, _RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp) {
   // TODO: Implement GPU backend.
-  return __pstl_stable_sort<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __comp);
+  return std::__pstl_stable_sort<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __comp);
 }
 
 _LIBCPP_END_NAMESPACE_STD

>From 8fe2440710ad6571146101cd55dfe06f1ebb578f Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Fri, 20 Oct 2023 12:54:28 -0700
Subject: [PATCH 23/65] Added std::__rewrap_iter to std::transform

---
 .../__algorithm/pstl_backends/openmp/transform.h | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
index bd795d5571f0d7..0338771728d9d0 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
@@ -32,7 +32,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 //===----------------------------------------------------------------------===//
 
 template <class _Tp, class _DifferenceType, class _Up, class _Function>
-_LIBCPP_HIDE_FROM_ABI optional<__empty>
+_LIBCPP_HIDE_FROM_ABI _Tp*
 __omp_transform(_Tp* __in1, _DifferenceType __n, _Up* __out1, _Function __f) noexcept {
   // The order of the following maps matter, as we wish to move the data. If
   // they were placed in the reverse order, and __in equals __out, then we would
@@ -48,11 +48,11 @@ __omp_transform(_Tp* __in1, _DifferenceType __n, _Up* __out1, _Function __f) noe
   // count is decremented to zero.
   __par_backend::__omp_map_release(__in1, __n);
   __par_backend::__omp_map_from(__out1, __n);
-  return __empty{};
+  return __out1 + __n;
 }
 
 template <class _Tp, class _DifferenceType, class _Up, class _Vp, class _Function>
-_LIBCPP_HIDE_FROM_ABI optional<__empty>
+_LIBCPP_HIDE_FROM_ABI _Tp*
 __omp_transform(_Tp* __in1, _DifferenceType __n, _Up* __in2, _Vp* __out1, _Function __f) noexcept {
   // The order of the following maps matter, as we wish to move the data. If
   // they were placed in the reverse order, and __out equals __in1 or __in2,
@@ -72,7 +72,7 @@ __omp_transform(_Tp* __in1, _DifferenceType __n, _Up* __in2, _Vp* __out1, _Funct
   __par_backend::__omp_map_release(__in1, __n);
   __par_backend::__omp_map_release(__in2, __n);
   __par_backend::__omp_map_from(__out1, __n);
-  return __empty{};
+  return __out1 + __n;
 }
 
 template <class _ExecutionPolicy, class _ForwardIterator, class _ForwardOutIterator, class _UnaryOperation>
@@ -86,8 +86,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator> __pstl_transform(
                 __is_parallel_execution_policy_v<_ExecutionPolicy> &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
-    std::__omp_transform(std::__unwrap_iter(__first), __last - __first, std::__unwrap_iter(__result), __op);
-    return __result + (__last - __first);
+    std::__rewrap_iter(__result,std::__omp_transform(std::__unwrap_iter(__first), __last - __first, std::__unwrap_iter(__result), __op));
   }
   // If it is not safe to offload to the GPU, we rely on the CPU backend.
   return std::__pstl_transform<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __result, __op);
@@ -111,13 +110,12 @@ _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator> __pstl_transform(
                 __libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator2>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
-    std::__omp_transform(
+    return std::__rewrap_iter(__result,std::__omp_transform(
         std::__unwrap_iter(__first1),
         __last1 - __first1,
         std::__unwrap_iter(__first2),
         std::__unwrap_iter(__result),
-        __op);
-    return __result + (__last1 - __first1);
+        __op));
   }
   // If it is not safe to offload to the GPU, we rely on the CPU backend.
   return std::__pstl_transform<_ExecutionPolicy>(__cpu_backend_tag{}, __first1, __last1, __first2, __result, __op);

>From 5e4811345619942be3dead4c7b6cf4a3b79d0f51 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Fri, 20 Oct 2023 13:17:56 -0700
Subject: [PATCH 24/65] Clang-formatted
 libcxx/include/__algorithm/pstl_backends/openmp/transform.h

---
 .../pstl_backends/openmp/transform.h          | 21 +++++++++++--------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
index 0338771728d9d0..e1c5f54675b387 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
@@ -32,8 +32,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 //===----------------------------------------------------------------------===//
 
 template <class _Tp, class _DifferenceType, class _Up, class _Function>
-_LIBCPP_HIDE_FROM_ABI _Tp*
-__omp_transform(_Tp* __in1, _DifferenceType __n, _Up* __out1, _Function __f) noexcept {
+_LIBCPP_HIDE_FROM_ABI _Tp* __omp_transform(_Tp* __in1, _DifferenceType __n, _Up* __out1, _Function __f) noexcept {
   // The order of the following maps matter, as we wish to move the data. If
   // they were placed in the reverse order, and __in equals __out, then we would
   // allocate the buffer on the device without copying the data.
@@ -86,7 +85,9 @@ _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator> __pstl_transform(
                 __is_parallel_execution_policy_v<_ExecutionPolicy> &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
-    std::__rewrap_iter(__result,std::__omp_transform(std::__unwrap_iter(__first), __last - __first, std::__unwrap_iter(__result), __op));
+    std::__rewrap_iter(
+        __result,
+        std::__omp_transform(std::__unwrap_iter(__first), __last - __first, std::__unwrap_iter(__result), __op));
   }
   // If it is not safe to offload to the GPU, we rely on the CPU backend.
   return std::__pstl_transform<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __result, __op);
@@ -110,12 +111,14 @@ _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator> __pstl_transform(
                 __libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator2>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
-    return std::__rewrap_iter(__result,std::__omp_transform(
-        std::__unwrap_iter(__first1),
-        __last1 - __first1,
-        std::__unwrap_iter(__first2),
-        std::__unwrap_iter(__result),
-        __op));
+    return std::__rewrap_iter(
+        __result,
+        std::__omp_transform(
+            std::__unwrap_iter(__first1),
+            __last1 - __first1,
+            std::__unwrap_iter(__first2),
+            std::__unwrap_iter(__result),
+            __op));
   }
   // If it is not safe to offload to the GPU, we rely on the CPU backend.
   return std::__pstl_transform<_ExecutionPolicy>(__cpu_backend_tag{}, __first1, __last1, __first2, __result, __op);

>From 124c55bf309895ff08ca8d9ae9762d34c59e3472 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Fri, 20 Oct 2023 14:40:03 -0700
Subject: [PATCH 25/65] Fixing errors detected by C++26 buildbot

---
 .../__algorithm/pstl_backends/openmp/find_if.h   |  8 ++++----
 .../pstl_backends/openmp/omp_offload.h           | 16 ++++++++--------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
index 8d7b196ec176f7..b86c6cbebf2201 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
@@ -27,15 +27,15 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 template <class _Tp, class _DifferenceType, class _Predicate>
 _LIBCPP_HIDE_FROM_ABI _Tp* __omp_find_if(_Tp* __first, _DifferenceType __n, _Predicate __pred) noexcept {
   __par_backend::__omp_map_to(__first, __n);
-  _DifferenceType idx = __n;
-#  pragma omp target teams distribute parallel for simd reduction(min : idx)
+  _DifferenceType __idx = __n;
+#  pragma omp target teams distribute parallel for simd reduction(min : __idx)
   for (_DifferenceType __i = 0; __i < __n; ++__i) {
     if (__pred(*(__first + __i))) {
-      idx = (__i < idx) ? __i : idx;
+      __idx = (__i < __idx) ? __i : __idx;
     }
   }
   __par_backend::__omp_map_release(__first, __n);
-  return __first + idx;
+  return __first + __idx;
 }
 
 template <class _ExecutionPolicy, class _ForwardIterator, class _Predicate>
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h b/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
index 88ee4cdbbc4fae..99ac28a7a33ead 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
@@ -45,30 +45,30 @@ inline namespace __omp_gpu_backend {
 
 template <class _Iterator, class _DifferenceType>
 _LIBCPP_HIDE_FROM_ABI void
-__omp_map_to([[maybe_unused]] const _Iterator p, [[maybe_unused]] const _DifferenceType len) noexcept {
+__omp_map_to([[maybe_unused]] const _Iterator __p, [[maybe_unused]] const _DifferenceType __len) noexcept {
   static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
-#  pragma omp target enter data map(to : p[0 : len])
+#  pragma omp target enter data map(to : __p [0:__len])
 }
 
 template <class _Iterator, class _DifferenceType>
 _LIBCPP_HIDE_FROM_ABI void
-__omp_map_from([[maybe_unused]] const _Iterator p, [[maybe_unused]] const _DifferenceType len) noexcept {
+__omp_map_from([[maybe_unused]] const _Iterator __p, [[maybe_unused]] const _DifferenceType __len) noexcept {
   static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
-#  pragma omp target exit data map(from : p[0 : len])
+#  pragma omp target exit data map(from : __p [0:__len])
 }
 
 template <class _Iterator, class _DifferenceType>
 _LIBCPP_HIDE_FROM_ABI void
-__omp_map_alloc([[maybe_unused]] const _Iterator p, [[maybe_unused]] const _DifferenceType len) noexcept {
+__omp_map_alloc([[maybe_unused]] const _Iterator __p, [[maybe_unused]] const _DifferenceType __len) noexcept {
   static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
-#  pragma omp target enter data map(alloc : p[0 : len])
+#  pragma omp target enter data map(alloc : __p [0:__len])
 }
 
 template <class _Iterator, class _DifferenceType>
 _LIBCPP_HIDE_FROM_ABI void
-__omp_map_release([[maybe_unused]] const _Iterator p, [[maybe_unused]] const _DifferenceType len) noexcept {
+__omp_map_release([[maybe_unused]] const _Iterator __p, [[maybe_unused]] const _DifferenceType __len) noexcept {
   static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
-#  pragma omp target exit data map(release : p[0 : len])
+#  pragma omp target exit data map(release : __p [0:__len])
 }
 
 } // namespace __omp_gpu_backend

>From bf7a98d6a04a24220ea419e4be31cb1abb560cca Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Mon, 23 Oct 2023 11:05:33 -0700
Subject: [PATCH 26/65] Updating includes once more after refactorings

---
 .../__algorithm/pstl_backends/openmp.h        | 35 +++++++++++++++++++
 .../__algorithm/pstl_backends/openmp/fill.h   |  1 +
 .../pstl_backends/openmp/find_if.h            |  1 +
 .../pstl_backends/openmp/for_each.h           |  1 +
 .../pstl_backends/openmp/omp_offload.h        | 27 ++++----------
 .../pstl_backends/openmp/transform.h          |  1 +
 .../pstl_backends/openmp/transform_reduce.h   |  2 ++
 7 files changed, 47 insertions(+), 21 deletions(-)

diff --git a/libcxx/include/__algorithm/pstl_backends/openmp.h b/libcxx/include/__algorithm/pstl_backends/openmp.h
index 9e5490db655f4b..a1d015e056837b 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp.h
@@ -11,6 +11,41 @@
 
 #include <__config>
 
+/*
+Combined OpenMP CPU and GPU Backend
+===================================
+Contrary to the CPU backends found in ./cpu_backends/, the OpenMP backend can
+target both CPUs and GPUs. The OpenMP standard defines that when offloading code
+to an accelerator, the compiler must generate fallback code for execution on
+the host. Thereby, the backend works as a CPU backend if no targeted accelerator
+is available at execution time. The target regions can also be compiled directly
+for a CPU architecture, for instance by adding the command-line option
+`-fopenmp-targets=x86_64-pc-linux-gnu` in Clang.
+
+Implicit Assumptions
+--------------------
+If the user provides a function pointer as an argument to a parallel algorithm,
+it is assumed to be a device pointer, as there is currently no way to
+check whether a host or device pointer was passed.
+
+Mapping Clauses
+---------------
+In some of the parallel algorithms, the user is allowed to provide the same
+iterator as input and output. Hence, the order of the maps matters. Therefore,
+`pragma omp target enter data map(to:...)` must be used before
+`pragma omp target enter data map(alloc:...)`. Conversely, the maps with map
+type `release` must be placed before the maps with map type `from` when
+transferring the result from the device to the host.
+
+Exceptions
+----------
+Currently, GPU architectures do not handle exceptions. OpenMP target regions are
+allowed to contain try/catch statements and throw expressions in Clang, but if a
+throw expression is reached, it will terminate the program. That does not
+conform with the C++ standard.
+
+*/
+
 #include <__algorithm/pstl_backends/openmp/backend.h>
 
 #include <__algorithm/pstl_backends/openmp/any_of.h>
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/fill.h b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
index 4262f778fa9121..53de1284acc60b 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
@@ -12,6 +12,7 @@
 #include <__algorithm/fill.h>
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
+#include <__algorithm/unwrap_iter.h>
 #include <__config>
 #include <__iterator/wrap_iter.h>
 #include <__type_traits/is_execution_policy.h>
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
index b86c6cbebf2201..db128e058823db 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
@@ -12,6 +12,7 @@
 #include <__algorithm/find_if.h>
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
+#include <__algorithm/unwrap_iter.h>
 #include <__config>
 #include <__iterator/wrap_iter.h>
 #include <optional>
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
index f64efc170537af..ef73b864773fc1 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
@@ -12,6 +12,7 @@
 #include <__algorithm/for_each.h>
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
+#include <__algorithm/unwrap_iter.h>
 #include <__config>
 #include <__iterator/wrap_iter.h>
 #include <__type_traits/is_execution_policy.h>
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h b/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
index 99ac28a7a33ead..d47fdcd39c39b2 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
@@ -9,33 +9,20 @@
 #ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_OMP_OFFLOAD_H
 #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_OMP_OFFLOAD_H
 
-#include <__algorithm/unwrap_iter.h>
 #include <__assert>
 #include <__config>
-#include <__functional/operations.h>
 #include <__iterator/iterator_traits.h>
-#include <__iterator/wrap_iter.h>
-#include <__memory/addressof.h>
-#include <__memory/pointer_traits.h>
-#include <__type_traits/is_pointer.h>
-#include <__type_traits/is_same.h>
-#include <__utility/empty.h>
-#include <__utility/move.h>
-#include <cstddef>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
 #endif
 
-_LIBCPP_PUSH_MACROS
-#include <__undef_macros>
-
 #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 namespace __par_backend {
-inline namespace __omp_gpu_backend {
+inline namespace __omp_backend {
 
 //===----------------------------------------------------------------------===//
 // The following four functions can be used to map contiguous array sections to
@@ -47,37 +34,35 @@ template <class _Iterator, class _DifferenceType>
 _LIBCPP_HIDE_FROM_ABI void
 __omp_map_to([[maybe_unused]] const _Iterator __p, [[maybe_unused]] const _DifferenceType __len) noexcept {
   static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
-#  pragma omp target enter data map(to : __p [0:__len])
+#  pragma omp target enter data map(to : __p[0 : __len])
 }
 
 template <class _Iterator, class _DifferenceType>
 _LIBCPP_HIDE_FROM_ABI void
 __omp_map_from([[maybe_unused]] const _Iterator __p, [[maybe_unused]] const _DifferenceType __len) noexcept {
   static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
-#  pragma omp target exit data map(from : __p [0:__len])
+#  pragma omp target exit data map(from : __p[0 : __len])
 }
 
 template <class _Iterator, class _DifferenceType>
 _LIBCPP_HIDE_FROM_ABI void
 __omp_map_alloc([[maybe_unused]] const _Iterator __p, [[maybe_unused]] const _DifferenceType __len) noexcept {
   static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
-#  pragma omp target enter data map(alloc : __p [0:__len])
+#  pragma omp target enter data map(alloc : __p[0 : __len])
 }
 
 template <class _Iterator, class _DifferenceType>
 _LIBCPP_HIDE_FROM_ABI void
 __omp_map_release([[maybe_unused]] const _Iterator __p, [[maybe_unused]] const _DifferenceType __len) noexcept {
   static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
-#  pragma omp target exit data map(release : __p [0:__len])
+#  pragma omp target exit data map(release : __p[0 : __len])
 }
 
-} // namespace __omp_gpu_backend
+} // namespace __omp_backend
 } // namespace __par_backend
 
 _LIBCPP_END_NAMESPACE_STD
 
 #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && && _LIBCPP_STD_VER >= 17
 
-_LIBCPP_POP_MACROS
-
 #endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_OMP_OFFLOAD_H
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
index e1c5f54675b387..eec6fb6fed8207 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
@@ -12,6 +12,7 @@
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__algorithm/transform.h>
+#include <__algorithm/unwrap_iter.h>
 #include <__config>
 #include <__iterator/concepts.h>
 #include <__iterator/wrap_iter.h>
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
index 1684c3d273b223..a03dafabc8d387 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
@@ -11,7 +11,9 @@
 
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
+#include <__algorithm/unwrap_iter.h>
 #include <__config>
+#include <__functional/operations.h>
 #include <__iterator/concepts.h>
 #include <__iterator/wrap_iter.h>
 #include <__numeric/transform_reduce.h>

>From 519004d6cea6b2aa0ba8648fea124de690596fe0 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Mon, 23 Oct 2023 12:29:08 -0700
Subject: [PATCH 27/65] Adding more comments and removing more includes

---
 .../__algorithm/pstl_backends/openmp.h        | 21 +++++++++++++++++++
 .../__algorithm/pstl_backends/openmp/any_of.h |  1 -
 .../pstl_backends/openmp/backend.h            |  1 -
 .../__algorithm/pstl_backends/openmp/fill.h   |  1 -
 .../pstl_backends/openmp/for_each.h           |  1 -
 .../pstl_backends/openmp/transform.h          |  1 -
 .../pstl_backends/openmp/transform_reduce.h   |  2 --
 7 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/libcxx/include/__algorithm/pstl_backends/openmp.h b/libcxx/include/__algorithm/pstl_backends/openmp.h
index a1d015e056837b..a3b52456dc6a60 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp.h
@@ -22,6 +22,24 @@ is available at execution time. The target regions can also be compiled directly
 for a CPU architecture, for instance by adding the command-line option
 `-fopenmp-targets=x86_64-pc-linux-gnu` in Clang.
 
+When is an Algorithm Offloaded?
+-------------------------------
+Only parallel algorithms with the parallel unsequenced execution policy are
+offloaded to the device. We cannot offload parallel algorithms with a parallel
+execution policy to GPUs because invocations executing in the same thread "are
+indeterminately sequenced with respect to each other" which we cannot guarantee
+on a GPU.
+
+The standard draft states that "the semantics [...] allow the implementation to
+fall back to sequential execution if the system cannot parallelize an algorithm
+invocation". If it is not deemed safe to offload the parallel algorithm to the
+device, we first fall back to a parallel unsequenced implementation from
+./cpu_backends. The CPU implementation may then fall back to sequential
+execution. In that way we strive to achieve the best possible performance.
+
+Further, "it is the caller's responsibility to ensure that the invocation does
+not introduce data races or deadlocks."
+
 Implicit Assumptions
 --------------------
 If the user provides a function pointer as an argument to a parallel algorithm,
@@ -44,6 +62,9 @@ allowed to contain try/catch statements and throw expressions in Clang, but if a
 throw expression is reached, it will terminate the program. That does not
 conform with the C++ standard.
 
+[This document](https://eel.is/c++draft/algorithms.parallel) has been used as
+a reference for these considerations.
+
 */
 
 #include <__algorithm/pstl_backends/openmp/backend.h>
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/any_of.h b/libcxx/include/__algorithm/pstl_backends/openmp/any_of.h
index 65f2294ff2ee5f..ec5b4c4a4c3aa2 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/any_of.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/any_of.h
@@ -13,7 +13,6 @@
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
-#include <__type_traits/is_execution_policy.h>
 #include <optional>
 
 #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/backend.h b/libcxx/include/__algorithm/pstl_backends/openmp/backend.h
index eb5e40d6f20a94..401bb2a770379e 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/backend.h
@@ -10,7 +10,6 @@
 #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_BACKEND_H
 
 #include <__config>
-#include <cstddef>
 
 #include <__algorithm/pstl_backends/openmp/omp_offload.h>
 
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/fill.h b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
index 53de1284acc60b..97e80c5ca853b2 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
@@ -17,7 +17,6 @@
 #include <__iterator/wrap_iter.h>
 #include <__type_traits/is_execution_policy.h>
 #include <__type_traits/remove_pointer.h>
-#include <__utility/empty.h>
 #include <optional>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
index ef73b864773fc1..e910d8dcc1d394 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
@@ -16,7 +16,6 @@
 #include <__config>
 #include <__iterator/wrap_iter.h>
 #include <__type_traits/is_execution_policy.h>
-#include <__utility/empty.h>
 #include <optional>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
index eec6fb6fed8207..f8aca11fc2102f 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
@@ -14,7 +14,6 @@
 #include <__algorithm/transform.h>
 #include <__algorithm/unwrap_iter.h>
 #include <__config>
-#include <__iterator/concepts.h>
 #include <__iterator/wrap_iter.h>
 #include <__type_traits/is_execution_policy.h>
 #include <optional>
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
index a03dafabc8d387..9e83a91d10cade 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
@@ -14,10 +14,8 @@
 #include <__algorithm/unwrap_iter.h>
 #include <__config>
 #include <__functional/operations.h>
-#include <__iterator/concepts.h>
 #include <__iterator/wrap_iter.h>
 #include <__numeric/transform_reduce.h>
-#include <__type_traits/integral_constant.h>
 #include <__type_traits/is_arithmetic.h>
 #include <__type_traits/is_execution_policy.h>
 #include <__type_traits/operation_traits.h>

>From 2e750fe523dc48923258d9d031b69325236697f5 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Mon, 23 Oct 2023 12:49:36 -0700
Subject: [PATCH 28/65] Fix Apple build bot

---
 libcxx/cmake/caches/Apple.cmake | 2 +-
 libcxx/src/CMakeLists.txt       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/libcxx/cmake/caches/Apple.cmake b/libcxx/cmake/caches/Apple.cmake
index 804eccd3a5dc5e..e1aea0313cb9f1 100644
--- a/libcxx/cmake/caches/Apple.cmake
+++ b/libcxx/cmake/caches/Apple.cmake
@@ -7,7 +7,7 @@ set(LIBCXX_ENABLE_STATIC ON CACHE BOOL "")
 set(LIBCXX_ENABLE_SHARED ON CACHE BOOL "")
 set(LIBCXX_CXX_ABI libcxxabi CACHE STRING "")
 set(LIBCXX_ENABLE_VENDOR_AVAILABILITY_ANNOTATIONS ON CACHE BOOL "")
-set(LIBCXX_PSTL_CPU_BACKEND libdispatch CACHE STRING "")
+set(LIBCXX_PSTL_BACKEND libdispatch CACHE STRING "")
 
 set(LIBCXX_HERMETIC_STATIC_LIBRARY ON CACHE BOOL "")
 set(LIBCXXABI_HERMETIC_STATIC_LIBRARY ON CACHE BOOL "")
diff --git a/libcxx/src/CMakeLists.txt b/libcxx/src/CMakeLists.txt
index 156dbe8a4c2f92..00215aa6ba9400 100644
--- a/libcxx/src/CMakeLists.txt
+++ b/libcxx/src/CMakeLists.txt
@@ -326,7 +326,7 @@ set(LIBCXX_EXPERIMENTAL_SOURCES
   experimental/memory_resource.cpp
   )
 
-if (LIBCXX_PSTL_CPU_BACKEND STREQUAL "libdispatch")
+if (LIBCXX_PSTL_BACKEND STREQUAL "libdispatch")
   list(APPEND LIBCXX_EXPERIMENTAL_SOURCES
     pstl/libdispatch.cpp
     )

>From ce586861b2c1e3ed6019538714e6304a563ca9ab Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Mon, 23 Oct 2023 14:30:10 -0700
Subject: [PATCH 29/65] Including empty.h in for_each and fill

---
 libcxx/include/__algorithm/pstl_backends/openmp/fill.h     | 1 +
 libcxx/include/__algorithm/pstl_backends/openmp/for_each.h | 1 +
 2 files changed, 2 insertions(+)

diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/fill.h b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
index 97e80c5ca853b2..53de1284acc60b 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
@@ -17,6 +17,7 @@
 #include <__iterator/wrap_iter.h>
 #include <__type_traits/is_execution_policy.h>
 #include <__type_traits/remove_pointer.h>
+#include <__utility/empty.h>
 #include <optional>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
index e910d8dcc1d394..ef73b864773fc1 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
@@ -16,6 +16,7 @@
 #include <__config>
 #include <__iterator/wrap_iter.h>
 #include <__type_traits/is_execution_policy.h>
+#include <__utility/empty.h>
 #include <optional>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)

>From 882d8fd366218aee43750c06248214e88ccf7afc Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Wed, 25 Oct 2023 22:51:03 -0700
Subject: [PATCH 30/65] Adding support for OpenMP compilation of LIT tests

---
 libcxx/CMakeLists.txt                                 |  2 ++
 .../include/__algorithm/pstl_backends/openmp/fill.h   |  5 ++++-
 .../__algorithm/pstl_backends/openmp/find_if.h        |  5 ++++-
 .../__algorithm/pstl_backends/openmp/for_each.h       |  5 ++++-
 .../__algorithm/pstl_backends/openmp/transform.h      | 11 +++++++++--
 .../pstl_backends/openmp/transform_reduce.h           |  9 +++++++--
 libcxx/test/CMakeLists.txt                            |  8 ++++++++
 libcxx/utils/libcxx/test/params.py                    |  8 ++++++++
 libcxxabi/CMakeLists.txt                              |  8 ++++++++
 9 files changed, 54 insertions(+), 7 deletions(-)

diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt
index ca78038ab359d9..48b33f49c8014e 100644
--- a/libcxx/CMakeLists.txt
+++ b/libcxx/CMakeLists.txt
@@ -781,6 +781,8 @@ elseif (LIBCXX_PSTL_BACKEND STREQUAL "libdispatch")
   config_define(1 _LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH)
 elseif (LIBCXX_PSTL_BACKEND STREQUAL "openmp")
   config_define(1 _LIBCPP_PSTL_BACKEND_OPENMP)
+  # Making sure that OpenMP is enabled during build
+  add_compile_options(-fopenmp)
 else()
   message(FATAL_ERROR "LIBCXX_PSTL_BACKEND is set to ${LIBCXX_PSTL_BACKEND}, which is not a valid backend.
                        Valid backends are: serial, std-thread, libdispatch, and openmp.")
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/fill.h b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
index 53de1284acc60b..7a9c379491a5f2 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
@@ -16,6 +16,7 @@
 #include <__config>
 #include <__iterator/wrap_iter.h>
 #include <__type_traits/is_execution_policy.h>
+#include <__type_traits/is_trivially_copyable.h>
 #include <__type_traits/remove_pointer.h>
 #include <__utility/empty.h>
 #include <optional>
@@ -46,7 +47,9 @@ __pstl_fill(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __last
   // lambda returning a constant.
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __is_parallel_execution_policy_v<_ExecutionPolicy> &&
-                __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
+                __libcpp_is_contiguous_iterator<_ForwardIterator>::value &&
+                is_trivially_copyable_v<remove_pointer_t<decltype(std::__unwrap_iter(__first))> > &&
+                is_trivially_copyable_v<_Tp>) {
     std::__rewrap_iter(__first, std::__omp_fill(std::__unwrap_iter(__first), __last - __first, __value));
     return __empty{};
   }
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
index db128e058823db..609638ebb3d250 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
@@ -15,6 +15,8 @@
 #include <__algorithm/unwrap_iter.h>
 #include <__config>
 #include <__iterator/wrap_iter.h>
+#include <__type_traits/is_trivially_copyable.h>
+#include <__type_traits/remove_pointer.h>
 #include <optional>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -46,7 +48,8 @@ __pstl_find_if(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __l
   // implementation of find_if
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __is_parallel_execution_policy_v<_ExecutionPolicy> &&
-                __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
+                __libcpp_is_contiguous_iterator<_ForwardIterator>::value &&
+                is_trivially_copyable_v<remove_pointer_t<decltype(std::__unwrap_iter(__first))> >) {
     return std::__rewrap_iter(__first, std::__omp_find_if(std::__unwrap_iter(__first), __last - __first, __pred));
     // Else we rey on the CPU PSTL backend
   } else {
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
index ef73b864773fc1..e29255a4892ee4 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
@@ -16,6 +16,8 @@
 #include <__config>
 #include <__iterator/wrap_iter.h>
 #include <__type_traits/is_execution_policy.h>
+#include <__type_traits/is_trivially_copyable.h>
+#include <__type_traits/remove_pointer.h>
 #include <__utility/empty.h>
 #include <optional>
 
@@ -44,7 +46,8 @@ __pstl_for_each(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __
   // implementation of for_each
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value &&
-                __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
+                __libcpp_is_contiguous_iterator<_ForwardIterator>::value &&
+                is_trivially_copyable_v<remove_pointer_t<decltype(std::__unwrap_iter(__first))> >) {
     std::__omp_for_each(std::__unwrap_iter(__first), __last - __first, __func);
     return __empty{};
   }
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
index f8aca11fc2102f..3353f74ed9166d 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
@@ -16,6 +16,8 @@
 #include <__config>
 #include <__iterator/wrap_iter.h>
 #include <__type_traits/is_execution_policy.h>
+#include <__type_traits/is_trivially_copyable.h>
+#include <__type_traits/remove_pointer.h>
 #include <optional>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -84,7 +86,9 @@ _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator> __pstl_transform(
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __is_parallel_execution_policy_v<_ExecutionPolicy> &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value &&
-                __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
+                __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value &&
+                is_trivially_copyable_v<remove_pointer_t<decltype(std::__unwrap_iter(__first))> > &&
+                is_trivially_copyable_v<remove_pointer_t<decltype(std::__unwrap_iter(__result))> >) {
     std::__rewrap_iter(
         __result,
         std::__omp_transform(std::__unwrap_iter(__first), __last - __first, std::__unwrap_iter(__result), __op));
@@ -110,7 +114,10 @@ _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator> __pstl_transform(
                 __is_parallel_execution_policy_v<_ExecutionPolicy> &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator2>::value &&
-                __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
+                __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value &&
+                is_trivially_copyable_v<remove_pointer_t<decltype(std::__unwrap_iter(__first1))> > &&
+                is_trivially_copyable_v<remove_pointer_t<decltype(std::__unwrap_iter(__first2))> > &&
+                is_trivially_copyable_v<remove_pointer_t<decltype(std::__unwrap_iter(__result))> >) {
     return std::__rewrap_iter(
         __result,
         std::__omp_transform(
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
index 9e83a91d10cade..6a074fb6e4fa9b 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
@@ -18,7 +18,9 @@
 #include <__numeric/transform_reduce.h>
 #include <__type_traits/is_arithmetic.h>
 #include <__type_traits/is_execution_policy.h>
+#include <__type_traits/is_trivially_copyable.h>
 #include <__type_traits/operation_traits.h>
+#include <__type_traits/remove_pointer.h>
 #include <optional>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -144,7 +146,8 @@ _LIBCPP_HIDE_FROM_ABI optional<_Tp> __pstl_transform_reduce(
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __is_parallel_execution_policy_v<_ExecutionPolicy> &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value && is_arithmetic_v<_Tp> &&
-                __is_supported_reduction<_BinaryOperation, _Tp, _Tp>::value) {
+                __is_supported_reduction<_BinaryOperation, _Tp, _Tp>::value &&
+                is_trivially_copyable_v<remove_pointer_t<decltype(std::__unwrap_iter(__first))> >) {
     return std::__omp_transform_reduce(std::__unwrap_iter(__first), __last - __first, __init, __reduce, __transform);
   }
   return std::__pstl_transform_reduce<_ExecutionPolicy>(
@@ -169,7 +172,9 @@ _LIBCPP_HIDE_FROM_ABI optional<_Tp> __pstl_transform_reduce(
                 __is_parallel_execution_policy_v<_ExecutionPolicy> &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator2>::value && is_arithmetic_v<_Tp> &&
-                __is_supported_reduction<_BinaryOperation1, _Tp, _Tp>::value) {
+                __is_supported_reduction<_BinaryOperation1, _Tp, _Tp>::value &&
+                is_trivially_copyable_v<remove_pointer_t<decltype(std::__unwrap_iter(__first1))> > &&
+                is_trivially_copyable_v<remove_pointer_t<decltype(std::__unwrap_iter(__first2))> >) {
     return std::__omp_transform_reduce(
         std::__unwrap_iter(__first1), std::__unwrap_iter(__first2), __last1 - __first1, __init, __reduce, __transform);
   }
diff --git a/libcxx/test/CMakeLists.txt b/libcxx/test/CMakeLists.txt
index 48dd233462ab3b..ae468498c9c876 100644
--- a/libcxx/test/CMakeLists.txt
+++ b/libcxx/test/CMakeLists.txt
@@ -35,6 +35,14 @@ if (LLVM_USE_SANITIZER)
   serialize_lit_string_param(SERIALIZED_LIT_PARAMS use_sanitizer "${LLVM_USE_SANITIZER}")
 endif()
 
+# If the OpenMP PSTL backend was enabled, the OpenMP compilation toolchain must
+# also be enabled for the LIT tests
+if (DEFINED LIBCXX_PSTL_BACKEND)
+  if (LIBCXX_PSTL_BACKEND STREQUAL "openmp")
+    serialize_lit_string_param(SERIALIZED_LIT_PARAMS enable_openmp "ON")
+  endif()
+endif()
+
 serialize_lit_params_list(SERIALIZED_LIT_PARAMS LIBCXX_TEST_PARAMS)
 
 if (NOT DEFINED LIBCXX_TEST_DEPS)
diff --git a/libcxx/utils/libcxx/test/params.py b/libcxx/utils/libcxx/test/params.py
index 2ead641354e585..5a2bdfff8a9bf5 100644
--- a/libcxx/utils/libcxx/test/params.py
+++ b/libcxx/utils/libcxx/test/params.py
@@ -330,6 +330,14 @@ def getStdFlag(cfg, std):
         default=f"{shlex.quote(sys.executable)} {shlex.quote(str(Path(__file__).resolve().parent.parent.parent / 'run.py'))}",
         help="Custom executor to use instead of the configured default.",
         actions=lambda executor: [AddSubstitution("%{executor}", executor)],
+    ),
+    Parameter(
+        name="enable_openmp",
+        choices=[True, False],
+        type=bool,
+        default=False,
+        help="Enable the OpenMP compilation toolchain if the PSTL backend was set to OpenMP.",
+        actions=lambda enabled: [AddCompileFlag("-fopenmp")] if enabled else [],
     )
 ]
 # fmt: on
diff --git a/libcxxabi/CMakeLists.txt b/libcxxabi/CMakeLists.txt
index 6fd4f02c750f5b..8078dffd507644 100644
--- a/libcxxabi/CMakeLists.txt
+++ b/libcxxabi/CMakeLists.txt
@@ -414,6 +414,14 @@ if (UNIX AND ${CMAKE_SYSTEM_NAME} MATCHES "AIX")
   add_definitions("-D_XOPEN_SOURCE=700")
 endif()
 
+# If the OpenMP PSTL backend has been enabled for libcxx, OpenMP must be
+# enabled during compilation
+if (DEFINED LIBCXX_PSTL_BACKEND)
+  if (LIBCXX_PSTL_BACKEND STREQUAL "openmp")
+    add_compile_options(-fopenmp)
+  endif()
+endif()
+
 #===============================================================================
 # Setup Source Code
 #===============================================================================

>From c6fd6df1f18267f3eb5295bf6e2b8d2819ca4aec Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Fri, 27 Oct 2023 17:55:10 -0700
Subject: [PATCH 31/65] Adding tests for the OpenMP PSTL backend

---
 libcxx/test/CMakeLists.txt                    |  2 +-
 .../alg.pstl.offload/fill_offload.pass.cpp    | 41 ++++++++++++
 .../alg.pstl.offload/find_if.pass.cpp         | 67 +++++++++++++++++++
 .../alg.pstl.offload/find_if_offload.pass.cpp | 41 ++++++++++++
 .../for_each_offload.pass.cpp                 | 41 ++++++++++++
 .../for_each_overwrite_input.pass.cpp         | 64 ++++++++++++++++++
 .../openmp_version_40.verify.cpp              | 21 ++++++
 .../openmp_version_45.verify.cpp              | 21 ++++++
 .../openmp_version_51.verify.cpp              | 21 ++++++
 .../transform_offload.pass.cpp                | 63 +++++++++++++++++
 .../transform_reduce_offload.pass.cpp         | 45 +++++++++++++
 libcxx/utils/libcxx/test/params.py            | 13 +++-
 12 files changed, 437 insertions(+), 3 deletions(-)
 create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp
 create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if.pass.cpp
 create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if_offload.pass.cpp
 create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_offload.pass.cpp
 create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp
 create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_version_40.verify.cpp
 create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_version_45.verify.cpp
 create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_version_51.verify.cpp
 create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_offload.pass.cpp
 create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_offload.pass.cpp

diff --git a/libcxx/test/CMakeLists.txt b/libcxx/test/CMakeLists.txt
index ae468498c9c876..34b5ee3f5551e2 100644
--- a/libcxx/test/CMakeLists.txt
+++ b/libcxx/test/CMakeLists.txt
@@ -39,7 +39,7 @@ endif()
 # also be enabled for the LIT tests
 if (DEFINED LIBCXX_PSTL_BACKEND)
   if (LIBCXX_PSTL_BACKEND STREQUAL "openmp")
-    serialize_lit_string_param(SERIALIZED_LIT_PARAMS enable_openmp "ON")
+    serialize_lit_string_param(SERIALIZED_LIT_PARAMS openmp_pstl_backend "ON")
   endif()
 endif()
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp
new file mode 100644
index 00000000000000..bd143b63225847
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp
@@ -0,0 +1,41 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This test will fail if the number of devices detected by OpenMP is larger
+// than zero but for_each is not executed on the device.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp --offload-arch=native
+
+// REQUIRES: openmp_pstl_backend
+
+#include <algorithm>
+#include <cassert>
+#include <execution>
+#include <vector>
+#include <omp.h>
+
+int main(void) {
+  // We only run the test if a device is detected by OpenMP
+  if (omp_get_num_devices() < 1)
+    return 0;
+
+  // Initializing test array
+  const int __test_size = 10000;
+  std::vector<int> __v(__test_size);
+  std::for_each(std::execution::par_unseq, __v.begin(), __v.end(), [](int& n) {
+    // Returns true if executed on the host
+    n = omp_is_initial_device();
+  });
+
+  auto __idx = std::find_if(std::execution::par_unseq, __v.begin(), __v.end(), [](int& n) -> bool { return n > 0; });
+  assert(__idx == __v.end() &&
+         "omp_is_initial_device() returned true in the target region. std::for_each was not offloaded.");
+  return 0;
+}
\ No newline at end of file
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if.pass.cpp
new file mode 100644
index 00000000000000..6a61887a443d02
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if.pass.cpp
@@ -0,0 +1,67 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This test verifies that you can overwrite the input in std::for_each. If the
+// result was not copied back from the device to the host, this test would fail.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp --offload-arch=native
+
+// REQUIRES: openmp_pstl_backend
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <execution>
+#include <vector>
+
+template <class _Tp>
+void check_find_if(_Tp& __data) {
+  const int __len = __data.end() - __data.begin();
+  // Setting all elements to two except for the indexes in __idx
+  int __idx[11] = {
+      0, __len / 10, __len / 9, __len / 8, __len / 7, __len / 6, __len / 5, __len / 4, __len / 3, __len / 2, __len - 1};
+  std::fill(std::execution::par_unseq, __data.begin(), __data.end(), 2);
+  for (auto __i : __idx) {
+    __data[__i]--;
+  };
+  // Asserting that the minimas are found in the correct order
+  for (auto __i : __idx) {
+    auto __found_min = std::find_if(
+        std::execution::par_unseq, __data.begin(), __data.end(), [&](decltype(__data[0])& n) -> bool { return n < 2; });
+    assert(__found_min == (__data.begin() + __i));
+    // Incrementing the minimum, so the next one can be found
+    (*__found_min)++;
+  }
+}
+
+int main(void) {
+  const int __test_size = 10000;
+  // Testing with vector of doubles
+  {
+    std::vector<double> __v(__test_size);
+    check_find_if(__v);
+  }
+  // Testing with vector of integers
+  {
+    std::vector<int> __v(__test_size);
+    check_find_if(__v);
+  }
+  // Testing with array of doubles
+  {
+    std::array<double, __test_size> __a;
+    check_find_if(__a);
+  }
+  // Testing with array of integers
+  {
+    std::array<int, __test_size> __a;
+    check_find_if(__a);
+  }
+  return 0;
+}
\ No newline at end of file
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if_offload.pass.cpp
new file mode 100644
index 00000000000000..4e91f52673c547
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if_offload.pass.cpp
@@ -0,0 +1,41 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This test will fail if the number of devices detected by OpenMP is larger
+// than zero but find_if is not executed on the device.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp
+
+// REQUIRES: openmp_pstl_backend
+
+#include <algorithm>
+#include <cassert>
+#include <execution>
+#include <vector>
+#include <omp.h>
+
+int main(void) {
+  // We only run the test if a device is detected by OpenMP
+  if (omp_get_num_devices() < 1)
+    return 0;
+
+  // Initializing test array
+  const int __test_size = 10000;
+  std::vector<double> __v(__test_size);
+  std::fill(std::execution::par_unseq, __v.begin(), __v.end(), 1.0);
+
+  auto __idx = std::find_if(std::execution::par_unseq, __v.begin(), __v.end(), [](double&) -> bool {
+    // Returns true if executed on the host
+    return omp_is_initial_device();
+  });
+  assert(__idx == __v.end() &&
+         "omp_is_initial_device() returned true in the target region. std::find_if was not offloaded.");
+  return 0;
+}
\ No newline at end of file
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_offload.pass.cpp
new file mode 100644
index 00000000000000..bd143b63225847
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_offload.pass.cpp
@@ -0,0 +1,41 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This test will fail if the number of devices detected by OpenMP is larger
+// than zero but for_each is not executed on the device.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp --offload-arch=native
+
+// REQUIRES: openmp_pstl_backend
+
+#include <algorithm>
+#include <cassert>
+#include <execution>
+#include <vector>
+#include <omp.h>
+
+int main(void) {
+  // We only run the test if a device is detected by OpenMP
+  if (omp_get_num_devices() < 1)
+    return 0;
+
+  // Initializing test array
+  const int __test_size = 10000;
+  std::vector<int> __v(__test_size);
+  std::for_each(std::execution::par_unseq, __v.begin(), __v.end(), [](int& n) {
+    // Returns true if executed on the host
+    n = omp_is_initial_device();
+  });
+
+  auto __idx = std::find_if(std::execution::par_unseq, __v.begin(), __v.end(), [](int& n) -> bool { return n > 0; });
+  assert(__idx == __v.end() &&
+         "omp_is_initial_device() returned true in the target region. std::for_each was not offloaded.");
+  return 0;
+}
\ No newline at end of file
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp
new file mode 100644
index 00000000000000..4f60b8dcc78838
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp
@@ -0,0 +1,64 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This test verifies that you can overwrite the input in std::for_each. If the
+// result was not copied back from the device to the host, this test would fail.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// ADDITIONAL_COMPILE_FLAGS: -fopenmp --offload-arch=native
+
+// REQUIRES: openmp_pstl_backend
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <execution>
+#include <vector>
+
+template <class _Tp, class _Predicate, class _Up>
+void overwrite(_Tp& __data, _Predicate __pred, const _Up& __value) {
+  // This function assumes that __pred will never be the identity transformation
+  // Filling array with __value
+  std::fill(std::execution::par_unseq, __data.begin(), __data.end(), __value);
+
+  // Updating the array with a lambda
+  std::for_each(std::execution::par_unseq, __data.begin(), __data.end(), __pred);
+
+  // Asserting that no elements have the intial value
+  auto __idx = std::find_if(
+      std::execution::par_unseq, __data.begin(), __data.end(), [&, __value](decltype(__data[0])& n) -> bool {
+        return n == __value;
+      });
+  assert(__idx == __data.end());
+}
+
+int main(void) {
+  const int __test_size = 10000;
+  // Testing with vector of doubles
+  {
+    std::vector<double> __v(__test_size);
+    overwrite(__v, [&](double& __n) { __n *= __n; }, 2.0);
+  }
+  // Testing with vector of integers
+  {
+    std::vector<int> __v(__test_size);
+    overwrite(__v, [&](int& __n) { __n *= __n; }, 2);
+  }
+  // Testing with array of doubles
+  {
+    std::array<double, __test_size> __a;
+    overwrite(__a, [&](double& __n) { __n *= __n; }, 2.0);
+  }
+  // Testing with array of integers
+  {
+    std::array<int, __test_size> __a;
+    overwrite(__a, [&](int& __n) { __n *= __n; }, 2);
+  }
+  return 0;
+}
\ No newline at end of file
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_version_40.verify.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_version_40.verify.cpp
new file mode 100644
index 00000000000000..ec6d567d67226f
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_version_40.verify.cpp
@@ -0,0 +1,21 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// OpenMP target offloading has only been supported since version 4.5. This test
+// verifies that a diagnostic error is prompted if the OpenMP version is below
+// the minimum required version.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// ADDITIONAL_COMPILE_FLAGS: -fopenmp -fopenmp-version=40
+
+// REQUIRES: openmp_pstl_backend
+
+#include <algorithm>
+
+// expected-error@__algorithm/pstl_backends/openmp/backend.h:26 {{"OpenMP target offloading has been supported since OpenMP version 4.5 (201511). Please use a more recent version of OpenMP."}}
\ No newline at end of file
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_version_45.verify.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_version_45.verify.cpp
new file mode 100644
index 00000000000000..881f0ee16e0a86
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_version_45.verify.cpp
@@ -0,0 +1,21 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// OpenMP target offloading has only been supported since version 4.5. This test
+// verifies that one can include algorithm without any diagnostics when using
+// the minimum required version of OpenMP.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// ADDITIONAL_COMPILE_FLAGS: -fopenmp -fopenmp-version=45
+
+// REQUIRES: openmp_pstl_backend
+
+#include <algorithm>
+
+// expected-no-diagnostics
\ No newline at end of file
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_version_51.verify.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_version_51.verify.cpp
new file mode 100644
index 00000000000000..d9a0242606bd16
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_version_51.verify.cpp
@@ -0,0 +1,21 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// OpenMP target offloading has only been supported since version 4.5. This test
+// verifies that one can include algorithm without any diagnostics when using a
+// version that is newer than the minimum requirement.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// ADDITIONAL_COMPILE_FLAGS: -fopenmp -fopenmp-version=51
+
+// REQUIRES: openmp_pstl_backend
+
+#include <algorithm>
+
+// expected-no-diagnostics
\ No newline at end of file
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_offload.pass.cpp
new file mode 100644
index 00000000000000..755c1054b469e1
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_offload.pass.cpp
@@ -0,0 +1,63 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This test will fail if the number of devices detected by OpenMP is larger
+// than zero but transform is not executed on the device.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp --offload-arch=native
+
+// REQUIRES: openmp_pstl_backend
+
+#include <algorithm>
+#include <cassert>
+#include <execution>
+#include <vector>
+#include <omp.h>
+
+int main(void) {
+  // We only run the test if a device is detected by OpenMP
+  if (omp_get_num_devices() < 1)
+    return 0;
+
+  // Initializing test array
+  const int __test_size = 10000;
+  std::vector<int> __host(__test_size);
+  std::vector<int> __device(__test_size);
+  // Should execute on host
+  std::transform(std::execution::unseq, __host.begin(), __host.end(), __host.begin(), [](int& h) {
+    // Returns true if executed on the host
+    h = omp_is_initial_device();
+    return h;
+  });
+
+  // Finding first index where omp_is_initial_device() returned true
+  auto __idx = std::find_if(std::execution::par_unseq, __host.begin(), __host.end(), [](int& n) -> bool { return n; });
+  assert(__idx == __host.begin() &&
+         "omp_is_initial_device() returned false. std::transform was offloaded but shouldn't be.");
+
+  // Should execute on device
+  std::transform(
+      std::execution::par_unseq,
+      __device.begin(),
+      __device.end(),
+      __host.begin(),
+      __device.begin(),
+      [](int& d, int& h) {
+        // Should return false
+        d = omp_is_initial_device();
+        return h == d;
+      });
+
+  // Finding first index where omp_is_initial_device() returned true
+  __idx = std::find_if(std::execution::par_unseq, __device.begin(), __device.end(), [](int& n) -> bool { return n; });
+  assert(__idx == __device.end() &&
+         "omp_is_initial_device() returned true in the target region. std::transform was not offloaded.");
+  return 0;
+}
\ No newline at end of file
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_offload.pass.cpp
new file mode 100644
index 00000000000000..072f0ffbaf9e6b
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_offload.pass.cpp
@@ -0,0 +1,45 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This test will fail if the number of devices detected by OpenMP is larger
+// than zero but transform_reduce is not executed on the device.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp
+
+// REQUIRES: openmp_pstl_backend
+
+#include <algorithm>
+#include <cassert>
+#include <execution>
+#include <vector>
+#include <omp.h>
+#include <functional>
+
+int main(void) {
+  // We only run the test if a device is detected by OpenMP
+  if (omp_get_num_devices() < 1)
+    return 0;
+
+  // Initializing test array
+  const int __test_size = 10000;
+  std::vector<int> __v(__test_size);
+  std::vector<int> __w(__test_size);
+  std::for_each(std::execution::par_unseq, __v.begin(), __v.end(), [](int& n) { n = !omp_is_initial_device(); });
+
+  std::for_each(std::execution::par_unseq, __w.begin(), __w.end(), [](int& n) { n = !omp_is_initial_device(); });
+
+  int result = std::transform_reduce(
+      std::execution::par_unseq, __v.begin(), __v.end(), __w.begin(), (int)0, std::plus{}, [](int& n, int& m) {
+        return n + m + omp_is_initial_device();
+      });
+  assert(result == 2 * __test_size &&
+         "omp_is_initial_device() returned true in the target region. std::transform_reduce was not offloaded.");
+  return 0;
+}
\ No newline at end of file
diff --git a/libcxx/utils/libcxx/test/params.py b/libcxx/utils/libcxx/test/params.py
index 5a2bdfff8a9bf5..df6aa5aa0b7661 100644
--- a/libcxx/utils/libcxx/test/params.py
+++ b/libcxx/utils/libcxx/test/params.py
@@ -332,12 +332,21 @@ def getStdFlag(cfg, std):
         actions=lambda executor: [AddSubstitution("%{executor}", executor)],
     ),
     Parameter(
-        name="enable_openmp",
+        name="openmp_pstl_backend",
         choices=[True, False],
         type=bool,
         default=False,
         help="Enable the OpenMP compilation toolchain if the PSTL backend was set to OpenMP.",
-        actions=lambda enabled: [AddCompileFlag("-fopenmp")] if enabled else [],
+        actions=lambda enabled: [
+            AddCompileFlag("-fopenmp"),
+            # The linker needs to find the correct version of libomptarget
+            AddLinkFlag("-Wl,-rpath,%{lib}/../../lib"),
+            AddLinkFlag("-L%{lib}/../../lib"),
+            #  The preprocessor needs to find the omp.h header
+            AddFlag("-I %{lib}/../../runtimes/runtimes-bins/openmp/runtime/src"),
+            # If the OpenMP PSTL backend was enabled, we wish to run the tests for it
+            AddFeature("openmp_pstl_backend")
+        ] if enabled else [],
     )
 ]
 # fmt: on

>From f49b23350c35fcf78276f58ed230604f4e3b94b6 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Mon, 30 Oct 2023 23:56:02 -0700
Subject: [PATCH 32/65] Updating tests to avoid issues found in code review

---
 .../alg.pstl.offload/fill_offload.pass.cpp    |  37 ++--
 .../alg.pstl.offload/find_if.pass.cpp         |  52 ++---
 .../alg.pstl.offload/find_if_offload.pass.cpp |  14 +-
 .../for_each_offload.pass.cpp                 |  16 +-
 .../for_each_overwrite_input.pass.cpp         |  45 ++--
 .../openmp_version_40.verify.cpp              |   2 +-
 .../openmp_version_45.verify.cpp              |   2 +-
 .../openmp_version_51.verify.cpp              |   2 +-
 .../transform_offload.pass.cpp                |  36 ++--
 .../transform_reduce_offload.pass.cpp         |  22 +-
 ...educe_supported_binary_operations.pass.cpp | 201 ++++++++++++++++++
 11 files changed, 319 insertions(+), 110 deletions(-)
 create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_supported_binary_operations.pass.cpp

diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp
index bd143b63225847..1557a667b81e39 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp
@@ -7,7 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 // This test will fail if the number of devices detected by OpenMP is larger
-// than zero but for_each is not executed on the device.
+// than zero but std::fill(std::execution::par_unseq,...) is not executed on
+// the device.
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
@@ -27,15 +28,27 @@ int main(void) {
     return 0;
 
   // Initializing test array
-  const int __test_size = 10000;
-  std::vector<int> __v(__test_size);
-  std::for_each(std::execution::par_unseq, __v.begin(), __v.end(), [](int& n) {
-    // Returns true if executed on the host
-    n = omp_is_initial_device();
-  });
-
-  auto __idx = std::find_if(std::execution::par_unseq, __v.begin(), __v.end(), [](int& n) -> bool { return n > 0; });
-  assert(__idx == __v.end() &&
-         "omp_is_initial_device() returned true in the target region. std::for_each was not offloaded.");
+  const int test_size = 10000;
+  std::vector<int> v(test_size, 2);
+
+  // By making an extra map, we can control when the data is mapped to and from
+  // the device, because the map inside std::fill will then only increment and
+  // decrement reference counters and not move data.
+  int* data = v.data();
+#pragma omp target enter data map(to : data[0 : v.size()])
+  std::fill(std::execution::par_unseq, v.begin(), v.end(), -2);
+
+  // At this point v should only contain the value 2
+  for (int vi : v)
+    assert(vi == 2 &&
+           "std::fill transferred data from device to the host but should only have decreased the reference counter.");
+
+// After moving the result back to the host it should now be -2
+#pragma omp target update from(data[0 : v.size()])
+  for (int vi : v)
+    assert(vi == -2 && "std::fill did not update the result on the device.");
+
+#pragma omp target exit data map(delete : data[0 : v.size()])
+
   return 0;
-}
\ No newline at end of file
+}
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if.pass.cpp
index 6a61887a443d02..d368701f571241 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if.pass.cpp
@@ -6,8 +6,9 @@
 //
 //===----------------------------------------------------------------------===//
 
-// This test verifies that you can overwrite the input in std::for_each. If the
-// result was not copied back from the device to the host, this test would fail.
+// This test verifies that std::find_if(std::execution::par_unseq,...) always
+// finds the first entry in a vector matching the condition. If it was confused
+// with std::any_of, it could return the indexes in a non-increasing order.
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
@@ -22,46 +23,47 @@
 #include <vector>
 
 template <class _Tp>
-void check_find_if(_Tp& __data) {
-  const int __len = __data.end() - __data.begin();
-  // Setting all elements to two except for the indexes in __idx
-  int __idx[11] = {
-      0, __len / 10, __len / 9, __len / 8, __len / 7, __len / 6, __len / 5, __len / 4, __len / 3, __len / 2, __len - 1};
-  std::fill(std::execution::par_unseq, __data.begin(), __data.end(), 2);
-  for (auto __i : __idx) {
-    __data[__i]--;
+void check_find_if(_Tp& data) {
+  const int len = data.end() - data.begin();
+  // Decrementing the values in the test indices
+  int idx[11] = {0, len / 10, len / 9, len / 8, len / 7, len / 6, len / 5, len / 4, len / 3, len / 2, len - 1};
+  for (auto i : idx) {
+    data[i] -= 1;
   };
+
   // Asserting that the minimas are found in the correct order
-  for (auto __i : __idx) {
-    auto __found_min = std::find_if(
-        std::execution::par_unseq, __data.begin(), __data.end(), [&](decltype(__data[0])& n) -> bool { return n < 2; });
-    assert(__found_min == (__data.begin() + __i));
+  for (auto i : idx) {
+    auto found_min = std::find_if(
+        std::execution::par_unseq, data.begin(), data.end(), [&](decltype(data[0])& n) -> bool { return n < 2; });
+    assert(found_min == (data.begin() + i));
     // Incrementing the minimum, so the next one can be found
-    (*__found_min)++;
+    (*found_min) += 1;
   }
 }
 
 int main(void) {
-  const int __test_size = 10000;
+  const int test_size = 10000;
   // Testing with vector of doubles
   {
-    std::vector<double> __v(__test_size);
-    check_find_if(__v);
+    std::vector<double> v(test_size, 2.0);
+    check_find_if(v);
   }
   // Testing with vector of integers
   {
-    std::vector<int> __v(__test_size);
-    check_find_if(__v);
+    std::vector<int> v(test_size, 2);
+    check_find_if(v);
   }
   // Testing with array of doubles
   {
-    std::array<double, __test_size> __a;
-    check_find_if(__a);
+    std::array<double, test_size> a;
+    a.fill(2.0);
+    check_find_if(a);
   }
   // Testing with array of integers
   {
-    std::array<int, __test_size> __a;
-    check_find_if(__a);
+    std::array<int, test_size> a;
+    a.fill(2);
+    check_find_if(a);
   }
   return 0;
-}
\ No newline at end of file
+}
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if_offload.pass.cpp
index 4e91f52673c547..89a884de80984b 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if_offload.pass.cpp
@@ -7,7 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 // This test will fail if the number of devices detected by OpenMP is larger
-// than zero but find_if is not executed on the device.
+// than zero but std::find_if(std::execution::par_unseq,...) is not executed on
+// the device.
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
@@ -27,15 +28,14 @@ int main(void) {
     return 0;
 
   // Initializing test array
-  const int __test_size = 10000;
-  std::vector<double> __v(__test_size);
-  std::fill(std::execution::par_unseq, __v.begin(), __v.end(), 1.0);
+  const int test_size = 10000;
+  std::vector<double> v(test_size, 1);
 
-  auto __idx = std::find_if(std::execution::par_unseq, __v.begin(), __v.end(), [](double&) -> bool {
+  auto idx = std::find_if(std::execution::par_unseq, v.begin(), v.end(), [](double&) -> bool {
     // Returns true if executed on the host
     return omp_is_initial_device();
   });
-  assert(__idx == __v.end() &&
+  assert(idx == v.end() &&
          "omp_is_initial_device() returned true in the target region. std::find_if was not offloaded.");
   return 0;
-}
\ No newline at end of file
+}
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_offload.pass.cpp
index bd143b63225847..6ee3310507c11d 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_offload.pass.cpp
@@ -7,7 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 // This test will fail if the number of devices detected by OpenMP is larger
-// than zero but for_each is not executed on the device.
+// than zero but for_each(std::execution::par_unseq,...) is not executed on the
+// device.
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
@@ -27,15 +28,14 @@ int main(void) {
     return 0;
 
   // Initializing test array
-  const int __test_size = 10000;
-  std::vector<int> __v(__test_size);
-  std::for_each(std::execution::par_unseq, __v.begin(), __v.end(), [](int& n) {
+  const int test_size = 10000;
+  std::vector<int> v(test_size);
+  std::for_each(std::execution::par_unseq, v.begin(), v.end(), [](int& n) {
     // Returns true if executed on the host
     n = omp_is_initial_device();
   });
 
-  auto __idx = std::find_if(std::execution::par_unseq, __v.begin(), __v.end(), [](int& n) -> bool { return n > 0; });
-  assert(__idx == __v.end() &&
-         "omp_is_initial_device() returned true in the target region. std::for_each was not offloaded.");
+  for (int vi : v)
+    assert(vi == 0 && "omp_is_initial_device() returned true in the target region. std::for_each was not offloaded.");
   return 0;
-}
\ No newline at end of file
+}
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp
index 4f60b8dcc78838..a39aa2cf2977f6 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp
@@ -6,8 +6,9 @@
 //
 //===----------------------------------------------------------------------===//
 
-// This test verifies that you can overwrite the input in std::for_each. If the
-// result was not copied back from the device to the host, this test would fail.
+// This test verifies that you can overwrite the input in
+// std::for_each(std::execution::par_unseq,...). If the result was not copied
+// back from the device to the host, this test would fail.
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
@@ -22,43 +23,43 @@
 #include <vector>
 
 template <class _Tp, class _Predicate, class _Up>
-void overwrite(_Tp& __data, _Predicate __pred, const _Up& __value) {
-  // This function assumes that __pred will never be the identity transformation
-  // Filling array with __value
-  std::fill(std::execution::par_unseq, __data.begin(), __data.end(), __value);
+void overwrite(_Tp& data, _Predicate pred, const _Up& value) {
+  // This function assumes that pred will never be the identity transformation
 
   // Updating the array with a lambda
-  std::for_each(std::execution::par_unseq, __data.begin(), __data.end(), __pred);
+  std::for_each(std::execution::par_unseq, data.begin(), data.end(), pred);
 
   // Asserting that no elements have the intial value
-  auto __idx = std::find_if(
-      std::execution::par_unseq, __data.begin(), __data.end(), [&, __value](decltype(__data[0])& n) -> bool {
-        return n == __value;
-      });
-  assert(__idx == __data.end());
+  for (int di : data)
+    assert(
+        di != value &&
+        "The GPU implementation of std::for_each does not allow users to mutate the input as the C++ standard does.");
 }
 
 int main(void) {
-  const int __test_size = 10000;
+  const double value  = 2.0;
+  const int test_size = 10000;
   // Testing with vector of doubles
   {
-    std::vector<double> __v(__test_size);
-    overwrite(__v, [&](double& __n) { __n *= __n; }, 2.0);
+    std::vector<double> v(test_size, value);
+    overwrite(v, [&](double& n) { n *= n; }, value);
   }
   // Testing with vector of integers
   {
-    std::vector<int> __v(__test_size);
-    overwrite(__v, [&](int& __n) { __n *= __n; }, 2);
+    std::vector<int> v(test_size, (int)value);
+    overwrite(v, [&](int& n) { n *= n; }, (int)value);
   }
   // Testing with array of doubles
   {
-    std::array<double, __test_size> __a;
-    overwrite(__a, [&](double& __n) { __n *= __n; }, 2.0);
+    std::array<double, test_size> a;
+    a.fill(value);
+    overwrite(a, [&](double& n) { n *= n; }, value);
   }
   // Testing with array of integers
   {
-    std::array<int, __test_size> __a;
-    overwrite(__a, [&](int& __n) { __n *= __n; }, 2);
+    std::array<int, test_size> a;
+    a.fill((int)value);
+    overwrite(a, [&](int& n) { n *= n; }, (int)value);
   }
   return 0;
-}
\ No newline at end of file
+}
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_version_40.verify.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_version_40.verify.cpp
index ec6d567d67226f..77836d47b08120 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_version_40.verify.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_version_40.verify.cpp
@@ -18,4 +18,4 @@
 
 #include <algorithm>
 
-// expected-error at __algorithm/pstl_backends/openmp/backend.h:26 {{"OpenMP target offloading has been supported since OpenMP version 4.5 (201511). Please use a more recent version of OpenMP."}}
\ No newline at end of file
+// expected-error at __algorithm/pstl_backends/openmp/backend.h:26 {{"OpenMP target offloading has been supported since OpenMP version 4.5 (201511). Please use a more recent version of OpenMP."}}
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_version_45.verify.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_version_45.verify.cpp
index 881f0ee16e0a86..ff0bf1cf67c8e8 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_version_45.verify.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_version_45.verify.cpp
@@ -18,4 +18,4 @@
 
 #include <algorithm>
 
-// expected-no-diagnostics
\ No newline at end of file
+// expected-no-diagnostics
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_version_51.verify.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_version_51.verify.cpp
index d9a0242606bd16..401285586caee2 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_version_51.verify.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_version_51.verify.cpp
@@ -18,4 +18,4 @@
 
 #include <algorithm>
 
-// expected-no-diagnostics
\ No newline at end of file
+// expected-no-diagnostics
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_offload.pass.cpp
index 755c1054b469e1..d813a8828fc52e 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_offload.pass.cpp
@@ -7,7 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 // This test will fail if the number of devices detected by OpenMP is larger
-// than zero but transform is not executed on the device.
+// than zero but std::transform(std::execution::par_unseq,...) is not executed
+// on the device.
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
@@ -26,38 +27,31 @@ int main(void) {
   if (omp_get_num_devices() < 1)
     return 0;
 
-  // Initializing test array
-  const int __test_size = 10000;
-  std::vector<int> __host(__test_size);
-  std::vector<int> __device(__test_size);
+  // Initializing test arrays
+  const int test_size = 10000;
+  std::vector<int> host(test_size);
+  std::vector<int> device(test_size);
   // Should execute on host
-  std::transform(std::execution::unseq, __host.begin(), __host.end(), __host.begin(), [](int& h) {
+  std::transform(std::execution::unseq, host.begin(), host.end(), host.begin(), [](int& h) {
     // Returns true if executed on the host
     h = omp_is_initial_device();
     return h;
   });
 
-  // Finding first index where omp_is_initial_device() returned true
-  auto __idx = std::find_if(std::execution::par_unseq, __host.begin(), __host.end(), [](int& n) -> bool { return n; });
-  assert(__idx == __host.begin() &&
-         "omp_is_initial_device() returned false. std::transform was offloaded but shouldn't be.");
+  // Asserting the std::transform(std::execution::unseq,...) executed on the host
+  for (int hi : host)
+    assert(hi && "omp_is_initial_device() returned false. std::transform was offloaded but shouldn't be.");
 
   // Should execute on device
   std::transform(
-      std::execution::par_unseq,
-      __device.begin(),
-      __device.end(),
-      __host.begin(),
-      __device.begin(),
-      [](int& d, int& h) {
+      std::execution::par_unseq, device.begin(), device.end(), host.begin(), device.begin(), [](int& d, int& h) {
         // Should return fals
         d = omp_is_initial_device();
         return h == d;
       });
 
-  // Finding first index where omp_is_initial_device() returned true
-  __idx = std::find_if(std::execution::par_unseq, __device.begin(), __device.end(), [](int& n) -> bool { return n; });
-  assert(__idx == __device.end() &&
-         "omp_is_initial_device() returned true in the target region. std::transform was not offloaded.");
+  // Asserting the std::transform(std::execution::par_unseq,...) executed on the device
+  for (int di : device)
+    assert(!di && "omp_is_initial_device() returned true in the target region. std::transform was not offloaded.");
   return 0;
-}
\ No newline at end of file
+}
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_offload.pass.cpp
index 072f0ffbaf9e6b..20acd2be45f85e 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_offload.pass.cpp
@@ -7,7 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 // This test will fail if the number of devices detected by OpenMP is larger
-// than zero but transform_reduce is not executed on the device.
+// than zero but std::transform_reduce(std::execution::par_unseq,...) is not
+// executed on the device.
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
@@ -18,9 +19,9 @@
 #include <algorithm>
 #include <cassert>
 #include <execution>
+#include <functional>
 #include <vector>
 #include <omp.h>
-#include <functional>
 
 int main(void) {
   // We only run the test if a device is detected by OpenMP
@@ -28,18 +29,15 @@ int main(void) {
     return 0;
 
   // Initializing test array
-  const int __test_size = 10000;
-  std::vector<int> __v(__test_size);
-  std::vector<int> __w(__test_size);
-  std::for_each(std::execution::par_unseq, __v.begin(), __v.end(), [](int& n) { n = !omp_is_initial_device(); });
-
-  std::for_each(std::execution::par_unseq, __w.begin(), __w.end(), [](int& n) { n = !omp_is_initial_device(); });
+  const int test_size = 10000;
+  std::vector<int> v(test_size, 1);
+  std::vector<int> w(test_size, 1);
 
   int result = std::transform_reduce(
-      std::execution::par_unseq, __v.begin(), __v.end(), __w.begin(), (int)0, std::plus{}, [](int& n, int& m) {
-        return n + m + omp_is_initial_device();
+      std::execution::par_unseq, v.begin(), v.end(), w.begin(), (int)0, std::plus{}, [](int& n, int& m) {
+        return n + m + omp_is_initial_device(); // Gives 2 if executed on device, 3 if executed on host
       });
-  assert(result == 2 * __test_size &&
+  assert(result == 2 * test_size &&
          "omp_is_initial_device() returned true in the target region. std::transform_reduce was not offloaded.");
   return 0;
-}
\ No newline at end of file
+}
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_supported_binary_operations.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_supported_binary_operations.pass.cpp
new file mode 100644
index 00000000000000..77046f1e52f64f
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_supported_binary_operations.pass.cpp
@@ -0,0 +1,201 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This test verifies that std::transform_reduce(std::execution::par_unseq,...)
+// can be offloaded for a number of supported binary operations. The following
+// binary operations should be supported for the reducer:
+// - std::plus
+// - std::minus
+// - std::multiplies
+// - std::logical_and
+// - std::logical_or
+// - std::bit_and
+// - std::bit_or
+// - std::bit_xor
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp
+
+// REQUIRES: openmp_pstl_backend
+
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <execution>
+#include <functional>
+#include <vector>
+#include <omp.h>
+#include <iostream>
+
+int main(void) {
+  // We only run the test if a device is detected by OpenMP
+  if (omp_get_num_devices() < 1)
+    return 0;
+
+  // Initializing test array
+  const int test_size = 10000;
+
+  //===--------------------------------------------------------------------===//
+  // Arithmetic binary operators
+  //===--------------------------------------------------------------------===//
+
+  // Addition with doubles
+  {
+    std::vector<double> v(test_size, 1.0);
+    std::vector<double> w(test_size, 2.0);
+    double result = std::transform_reduce(
+        std::execution::par_unseq, v.begin(), v.end(), w.begin(), 5.0, std::plus{}, [](double& a, double& b) {
+          return 0.5 * (b - a) * ((double)!omp_is_initial_device());
+        });
+    assert((std::abs(result - 0.5 * ((double)test_size) - 5.0) < 1e-8) &&
+           "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for the binary "
+           "operation std::plus.");
+  }
+
+  // Subtraction of floats
+  {
+    std::vector<float> v(test_size, 1.0f);
+    std::vector<float> w(test_size, 1.5f);
+    float result = std::transform_reduce(
+        std::execution::par_unseq,
+        v.begin(),
+        v.end(),
+        w.begin(),
+        1.25 * ((float)test_size),
+        std::minus{},
+        [](float& a, float& b) { return 0.5 * (a + b) * ((float)!omp_is_initial_device()); });
+    assert((std::abs(result) < 1e-8f) &&
+           "std::transform_reduce(std::execution::par_unseq,...) does not have the "
+           "intended effect for the binary operation std::minus.");
+  }
+
+  // Multiplication of doubles
+  {
+    std::vector<double> v(test_size, 1.0);
+    std::vector<double> w(test_size, 0.0001);
+    double result = std::transform_reduce(
+        std::execution::par_unseq, v.begin(), v.end(), w.begin(), -1.0, std::multiplies{}, [](double& a, double& b) {
+          return (a + b) * ((double)!omp_is_initial_device());
+        });
+    assert((std::abs(result + pow(1.0001, test_size)) < 1e-8) &&
+           "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for the binary "
+           "operation std::multiplies.");
+  }
+
+  //===--------------------------------------------------------------------===//
+  // Logical binary operators
+  //===--------------------------------------------------------------------===//
+
+  // Logical and
+  {
+    std::vector<int> v(test_size, 1);
+    // The result should be true with an initial value of 1
+    int result =
+        std::transform_reduce(std::execution::par_unseq, v.begin(), v.end(), 1, std::logical_and{}, [](int& a) {
+          return a && !omp_is_initial_device();
+        });
+    assert(result &&
+           "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for the binary "
+           "operation std::logical_and.");
+
+    // And false by an initial value of 0
+    result = std::transform_reduce(std::execution::par_unseq, v.begin(), v.end(), 0, std::logical_and{}, [](int& a) {
+      return a && !omp_is_initial_device();
+    });
+    assert(!result &&
+           "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for the binary "
+           "operation std::logical_and.");
+  }
+
+  // Logical or
+  {
+    std::vector<int> v(test_size, 0);
+    // The result should be true with an initial value of 1
+    int result = std::transform_reduce(std::execution::par_unseq, v.begin(), v.end(), 1, std::logical_or{}, [](int& a) {
+      return a && !omp_is_initial_device();
+    });
+    assert(result &&
+           "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for the binary "
+           "operation std::logical_or.");
+
+    // And false by an initial value of 0
+    result = std::transform_reduce(std::execution::par_unseq, v.begin(), v.end(), 0, std::logical_or{}, [](int& a) {
+      return a && !omp_is_initial_device();
+    });
+    assert(!result && "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for the "
+                      "binary operation std::logical_or.");
+  }
+
+  //===--------------------------------------------------------------------===//
+  // Bitwise binary operators
+  //===--------------------------------------------------------------------===//
+
+  // Bitwise and
+  {
+    std::vector<unsigned int> v(test_size, 3);
+    std::vector<unsigned int> w(test_size, 2);
+    // For odd numbers the result should be true
+    int result =
+        std::transform_reduce(std::execution::par_unseq, v.begin(), v.end(), 0x1, std::bit_and{}, [](unsigned int& a) {
+          return a + omp_is_initial_device();
+        });
+    assert(result && "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for the "
+                     "binary operation std::bit_and.");
+
+    // For even numbers the result should be false
+    result =
+        std::transform_reduce(std::execution::par_unseq, w.begin(), w.end(), 0x1, std::bit_and{}, [](unsigned int& a) {
+          return a + omp_is_initial_device();
+        });
+    assert(!result && "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for the "
+                      "binary operation std::bit_and.");
+  }
+
+  // Bitwise or
+  {
+    std::vector<unsigned int> v(test_size, 0);
+    int result =
+        std::transform_reduce(std::execution::par_unseq, v.begin(), v.end(), 0, std::bit_or{}, [](unsigned int& a) {
+          return a || omp_is_initial_device();
+        });
+    assert(!result && "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for the "
+                      "binary operation std::bit_or.");
+
+    // After adding a one, the result should be true
+    v[v.size() / 2] = 1;
+    result =
+        std::transform_reduce(std::execution::par_unseq, v.begin(), v.end(), 0, std::bit_or{}, [](unsigned int& a) {
+          return a && !omp_is_initial_device();
+        });
+    assert(result && "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for the "
+                     "binary operation std::bit_or.");
+  }
+
+  // Bitwise xor
+  {
+    std::vector<unsigned int> v(test_size, 0xef);
+    int result =
+        std::transform_reduce(std::execution::par_unseq, v.begin(), v.end(), 0, std::bit_xor{}, [](unsigned int& a) {
+          return a << omp_is_initial_device();
+        });
+    assert(result == 0 && "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for "
+                          "the binary operation std::bit_xor.");
+
+    // After changing one element to 0xea, the result should be 0xef ^ 0xea == 5
+    v[v.size() / 2] = 0xea;
+    result =
+        std::transform_reduce(std::execution::par_unseq, v.begin(), v.end(), 0, std::bit_xor{}, [](unsigned int& a) {
+          return a << omp_is_initial_device();
+        });
+    assert(result == 5 && "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for "
+                          "the binary operation std::bit_xor.");
+  }
+
+  return 0;
+}

>From 11e9d8b37613b38b1d022abc3ea87e9e6eda2c16 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Wed, 1 Nov 2023 10:05:03 -0700
Subject: [PATCH 33/65] Adding -fno-exceptions to tests to conform with
 https://github.com/llvm/llvm-project/pull/69669

---
 .../libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp  | 2 +-
 .../test/libcxx/algorithms/alg.pstl.offload/find_if.pass.cpp  | 2 +-
 .../algorithms/alg.pstl.offload/find_if_offload.pass.cpp      | 2 +-
 .../algorithms/alg.pstl.offload/for_each_offload.pass.cpp     | 2 +-
 .../alg.pstl.offload/for_each_overwrite_input.pass.cpp        | 2 +-
 .../algorithms/alg.pstl.offload/transform_offload.pass.cpp    | 2 +-
 .../alg.pstl.offload/transform_reduce_offload.pass.cpp        | 2 +-
 .../transform_reduce_supported_binary_operations.pass.cpp     | 2 +-
 libcxx/utils/libcxx/test/params.py                            | 4 ++++
 9 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp
index 1557a667b81e39..7dc5983d8f8e99 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp
@@ -12,7 +12,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp --offload-arch=native
+// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp -fno-exceptions
 
 // REQUIRES: openmp_pstl_backend
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if.pass.cpp
index d368701f571241..791e748a30c71c 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if.pass.cpp
@@ -12,7 +12,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp --offload-arch=native
+// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp -fno-exceptions
 
 // REQUIRES: openmp_pstl_backend
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if_offload.pass.cpp
index 89a884de80984b..e26c33e97f77bd 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if_offload.pass.cpp
@@ -12,7 +12,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp
+// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp -fno-exceptions
 
 // REQUIRES: openmp_pstl_backend
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_offload.pass.cpp
index 6ee3310507c11d..7a78e4282a8cb0 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_offload.pass.cpp
@@ -12,7 +12,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp --offload-arch=native
+// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp -fno-exceptions
 
 // REQUIRES: openmp_pstl_backend
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp
index a39aa2cf2977f6..bfd2e2819d953e 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp
@@ -12,7 +12,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -fopenmp --offload-arch=native
+// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp -fno-exceptions
 
 // REQUIRES: openmp_pstl_backend
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_offload.pass.cpp
index d813a8828fc52e..3f8c91b366ca4c 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_offload.pass.cpp
@@ -12,7 +12,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp --offload-arch=native
+// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp -fno-exceptions
 
 // REQUIRES: openmp_pstl_backend
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_offload.pass.cpp
index 20acd2be45f85e..229956215c6e12 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_offload.pass.cpp
@@ -12,7 +12,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp
+// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp -fno-exceptions
 
 // REQUIRES: openmp_pstl_backend
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_supported_binary_operations.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_supported_binary_operations.pass.cpp
index 77046f1e52f64f..4f8974625f5634 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_supported_binary_operations.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_supported_binary_operations.pass.cpp
@@ -20,7 +20,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp
+// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp -fno-exceptions
 
 // REQUIRES: openmp_pstl_backend
 
diff --git a/libcxx/utils/libcxx/test/params.py b/libcxx/utils/libcxx/test/params.py
index df6aa5aa0b7661..b3b64215ef2413 100644
--- a/libcxx/utils/libcxx/test/params.py
+++ b/libcxx/utils/libcxx/test/params.py
@@ -343,6 +343,10 @@ def getStdFlag(cfg, std):
             AddLinkFlag("-Wl,-rpath,%{lib}/../../lib"),
             AddLinkFlag("-L%{lib}/../../lib"),
             #  The preprocessor needs to find the omp.h header
+            # If OpenMP was installed as a project, the header lives in the
+            # following directory
+            AddFlag("-I %{lib}/../../projects/openmp/runtime/src/"),
+            # And if it was installed as a runtime it lives in
             AddFlag("-I %{lib}/../../runtimes/runtimes-bins/openmp/runtime/src"),
             # If the OpenMP PSTL backend was enbaled, we wish to run the tests for it
             AddFeature("openmp_pstl_backend")

>From 85655301dbd0e32bbd0952eb85372131727ff13d Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Wed, 1 Nov 2023 16:46:17 -0700
Subject: [PATCH 34/65] Passing on environment variables to GPU offloading
 tests

---
 libcxx/docs/UsingLibcxx.rst                   |  9 ++--
 .../__algorithm/pstl_backends/openmp/fill.h   |  2 +-
 .../pstl_backends/openmp/find_if.h            |  2 +-
 .../pstl_backends/openmp/for_each.h           |  2 +-
 .../pstl_backends/openmp/transform.h          |  4 +-
 .../pstl_backends/openmp/transform_reduce.h   |  4 +-
 libcxx/test/configs/llvm-libc++-shared.cfg.in |  2 +-
 .../alg.pstl.offload/fill_offload.pass.cpp    |  2 +-
 .../alg.pstl.offload/find_if.pass.cpp         |  2 +-
 .../alg.pstl.offload/find_if_offload.pass.cpp |  2 +-
 .../for_each_offload.pass.cpp                 |  2 +-
 .../for_each_overwrite_input.pass.cpp         |  2 +-
 .../gpu_environemt_variables.pass.cpp         | 51 +++++++++++++++++++
 .../transform_offload.pass.cpp                |  2 +-
 .../transform_reduce_offload.pass.cpp         |  2 +-
 ...educe_supported_binary_operations.pass.cpp | 10 ++--
 libcxx/utils/libcxx/test/params.py            |  2 +-
 libcxx/utils/run.py                           | 15 ++++++
 18 files changed, 90 insertions(+), 27 deletions(-)
 create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.offload/gpu_environemt_variables.pass.cpp

diff --git a/libcxx/docs/UsingLibcxx.rst b/libcxx/docs/UsingLibcxx.rst
index c9726ef4f60401..ce8fb04a8ec156 100644
--- a/libcxx/docs/UsingLibcxx.rst
+++ b/libcxx/docs/UsingLibcxx.rst
@@ -544,20 +544,17 @@ pointer can be obtained with `target map(from:<list of identifiers>)`.
 
   int main()
   {
-    int * a =  new int[LEN];
-    // Initialize the array to 2 on the device
-    std::fill(std::execution::par_unseq,a, a+LEN,2);
+    std::vector<int> a(LEN,2);
     // Get the device pointer for cube
     void (*dcube)(int& n);
     #pragma omp target map(from:dcube)
     dcube = &cube;
     // Pass the device function pointer to the parallel algorithm
-    std::for_each(std::execution::par_unseq,a, a+LEN,dcube);
+    std::for_each(std::execution::par_unseq,a.begin(), a.end(),dcube);
     // Validate that the result is 8 on the host for all array indices
-    std::for_each(std::execution::par,a, a+LEN,[&](int & n){
+    std::for_each(std::execution::par,a.begin(), a.end(),[&](int & n){
       assert(n == 8);
     });
-    delete[] a;
     return 0;
   }
 
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/fill.h b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
index 7a9c379491a5f2..6d2ba97df3c809 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
@@ -32,7 +32,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 template <class _Tp, class _DifferenceType, class _Up>
 _LIBCPP_HIDE_FROM_ABI _Tp* __omp_fill(_Tp* __out1, _DifferenceType __n, const _Up& __value) noexcept {
   __par_backend::__omp_map_alloc(__out1, __n);
-#  pragma omp target teams distribute parallel for simd
+#  pragma omp target teams distribute parallel for
   for (_DifferenceType __i = 0; __i < __n; ++__i)
     *(__out1 + __i) = __value;
   __par_backend::__omp_map_from(__out1, __n);
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
index 609638ebb3d250..bf569a4e61520a 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
@@ -31,7 +31,7 @@ template <class _Tp, class _DifferenceType, class _Predicate>
 _LIBCPP_HIDE_FROM_ABI _Tp* __omp_find_if(_Tp* __first, _DifferenceType __n, _Predicate __pred) noexcept {
   __par_backend::__omp_map_to(__first, __n);
   _DifferenceType __idx = __n;
-#  pragma omp target teams distribute parallel for simd reduction(min : __idx)
+#  pragma omp target teams distribute parallel for reduction(min : __idx)
   for (_DifferenceType __i = 0; __i < __n; ++__i) {
     if (__pred(*(__first + __i))) {
       __idx = (__i < __idx) ? __i : __idx;
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
index e29255a4892ee4..9e5d49009b0e7a 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
@@ -32,7 +32,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 template <class _Tp, class _DifferenceType, class _Function>
 _LIBCPP_HIDE_FROM_ABI _Tp* __omp_for_each(_Tp* __inout1, _DifferenceType __n, _Function __f) noexcept {
   __par_backend::__omp_map_to(__inout1, __n);
-#  pragma omp target teams distribute parallel for simd
+#  pragma omp target teams distribute parallel for
   for (_DifferenceType __i = 0; __i < __n; ++__i)
     __f(*(__inout1 + __i));
   __par_backend::__omp_map_from(__inout1, __n);
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
index 3353f74ed9166d..c3bfc4228a2252 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
@@ -40,7 +40,7 @@ _LIBCPP_HIDE_FROM_ABI _Tp* __omp_transform(_Tp* __in1, _DifferenceType __n, _Up*
   // allocate the buffer on the device without copying the data.
   __par_backend::__omp_map_to(__in1, __n);
   __par_backend::__omp_map_alloc(__out1, __n);
-#  pragma omp target teams distribute parallel for simd
+#  pragma omp target teams distribute parallel for
   for (_DifferenceType __i = 0; __i < __n; ++__i)
     *(__out1 + __i) = __f(*(__in1 + __i));
   // The order of the following two maps matters, since the user could legally
@@ -62,7 +62,7 @@ __omp_transform(_Tp* __in1, _DifferenceType __n, _Up* __in2, _Vp* __out1, _Funct
   __par_backend::__omp_map_to(__in1, __n);
   __par_backend::__omp_map_to(__in2, __n);
   __par_backend::__omp_map_alloc(__out1, __n);
-#  pragma omp target teams distribute parallel for simd
+#  pragma omp target teams distribute parallel for
   for (_DifferenceType __i = 0; __i < __n; ++__i)
     *(__out1 + __i) = __f(*(__in1 + __i), *(__in2 + __i));
   // The order of the following three maps matters, since the user could legally
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
index 6a074fb6e4fa9b..de617423899a07 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
@@ -48,7 +48,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
         std_op<_BinaryOperationType> __reduce,                                                                         \
         _UnaryOperation __transform) noexcept {                                                                        \
       __par_backend::__omp_map_to(__first, __n);                                                                       \
-_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init))                                   \
+_PSTL_PRAGMA(omp target teams distribute parallel for reduction(omp_op:__init))                                        \
       for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                  \
         __init = __reduce(__init, __transform(*(__first + __i)));                                                      \
       __par_backend::__omp_map_release(__first, __n);                                                                  \
@@ -71,7 +71,7 @@ _PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__in
         _UnaryOperation __transform) noexcept {                                                                        \
       __par_backend::__omp_map_to(__first1, __n);                                                                      \
       __par_backend::__omp_map_to(__first2, __n);                                                                      \
-_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init))                                   \
+_PSTL_PRAGMA(omp target teams distribute parallel for reduction(omp_op:__init))                                        \
       for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                  \
         __init = __reduce(__init, __transform(*(__first1 + __i), *(__first2 + __i)));                                  \
       __par_backend::__omp_map_release(__first1, __n);                                                                 \
diff --git a/libcxx/test/configs/llvm-libc++-shared.cfg.in b/libcxx/test/configs/llvm-libc++-shared.cfg.in
index 143b3b3feae110..8ffe69f6271510 100644
--- a/libcxx/test/configs/llvm-libc++-shared.cfg.in
+++ b/libcxx/test/configs/llvm-libc++-shared.cfg.in
@@ -24,4 +24,4 @@ libcxx.test.config.configure(
     libcxx.test.features.DEFAULT_FEATURES,
     config,
     lit_config
-)
+)
\ No newline at end of file
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp
index 7dc5983d8f8e99..a97cb42845eb6f 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp
@@ -12,7 +12,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp -fno-exceptions
+// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp -fno-exceptions --offload-arch=native -L%{lib}/../../lib -lomptarget -L%{lib}/../../projects/openmp/libomptarget/ -lomptarget.devicertl
 
 // REQUIRES: openmp_pstl_backend
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if.pass.cpp
index 791e748a30c71c..38544d76b86ad7 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if.pass.cpp
@@ -12,7 +12,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp -fno-exceptions
+// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp -fno-exceptions --offload-arch=native -L%{lib}/../../lib -lomptarget -L%{lib}/../../projects/openmp/libomptarget/ -lomptarget.devicertl
 
 // REQUIRES: openmp_pstl_backend
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if_offload.pass.cpp
index e26c33e97f77bd..82425be01c181f 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if_offload.pass.cpp
@@ -12,7 +12,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp -fno-exceptions
+// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp -fno-exceptions --offload-arch=native -L%{lib}/../../lib -lomptarget  -L%{lib}/../../projects/openmp/libomptarget/ -lomptarget.devicertl
 
 // REQUIRES: openmp_pstl_backend
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_offload.pass.cpp
index 7a78e4282a8cb0..e2a2797e5515dc 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_offload.pass.cpp
@@ -12,7 +12,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp -fno-exceptions
+// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp -fno-exceptions --offload-arch=native -L%{lib}/../../lib -lomptarget -L%{lib}/../../projects/openmp/libomptarget/ -lomptarget.devicertl
 
 // REQUIRES: openmp_pstl_backend
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp
index bfd2e2819d953e..8729b29ac07213 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp
@@ -12,7 +12,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp -fno-exceptions
+// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp -fno-exceptions --offload-arch=native -L%{lib}/../../lib -lomptarget -L%{lib}/../../projects/openmp/libomptarget/ -lomptarget.devicertl
 
 // REQUIRES: openmp_pstl_backend
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/gpu_environemt_variables.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/gpu_environemt_variables.pass.cpp
new file mode 100644
index 00000000000000..eb9265556cb0da
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/gpu_environemt_variables.pass.cpp
@@ -0,0 +1,51 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This test verifies that the libc++ test configuration forwards the AMD and
+// NVIDIA environment variables specifying the visible devices. Initially, when
+// developing the OpenMP offloading tests, this was not the case, and this test
+// will reveal if the configuration ever regresses in that way again.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp -fno-exceptions --offload-arch=native -L%{lib}/../../lib -lomptarget  -L%{lib}/../../projects/openmp/libomptarget/ -lomptarget.devicertl
+
+// REQUIRES: openmp_pstl_backend
+
+#include <string>
+#include <cassert>
+#include <omp.h>
+#include <iostream>
+
+// Returns the value of `env_var_name` ("" if unset) and sets `flag` to whether the variable exists.
+std::string get_env_var(std::string const& env_var_name, int& flag) {
+  // getenv yields a null pointer when the variable is not in the environment
+  char const* val = getenv(env_var_name.c_str());
+  flag            = (val != nullptr);
+  return (val != nullptr) ? val : "";
+}
+
+int main(void) {
+  // Set by get_env_var: nonzero when the queried environment variable exists
+  int status = 0;
+
+  // AMD's environment variable for visible devices: when set to a non-empty value, OpenMP must detect a device
+  std::string rocr_visible_devices = get_env_var("ROCR_VISIBLE_DEVICES", status);
+  if (status)
+    assert(
+        (rocr_visible_devices.empty() || (omp_get_num_devices() > 0)) &&
+        "ROCR_VISIBLE_DEVICES was set but no devices were detected by OpenMP. The libc++ test suite is misconfigured.");
+
+  // NVIDIA's environment variable for visible devices: when set to a non-empty value, OpenMP must detect a device
+  std::string cuda_visible_devices = get_env_var("CUDA_VISIBLE_DEVICES", status);
+  if (status)
+    assert(
+        (cuda_visible_devices.empty() || (omp_get_num_devices() > 0)) &&
+        "CUDA_VISIBLE_DEVICES was set but no devices were detected by OpenMP. The libc++ test suite is misconfigured.");
+  return 0;
+}
\ No newline at end of file
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_offload.pass.cpp
index 3f8c91b366ca4c..1be15fd15454ff 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_offload.pass.cpp
@@ -12,7 +12,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp -fno-exceptions
+// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp -fno-exceptions --offload-arch=native -L%{lib}/../../lib -lomptarget -L%{lib}/../../projects/openmp/libomptarget/ -lomptarget.devicertl
 
 // REQUIRES: openmp_pstl_backend
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_offload.pass.cpp
index 229956215c6e12..30d45060c4ec28 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_offload.pass.cpp
@@ -12,7 +12,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp -fno-exceptions
+// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp -fno-exceptions --offload-arch=native -L%{lib}/../../lib -lomptarget -L%{lib}/../../projects/openmp/libomptarget/ -lomptarget.devicertl
 
 // REQUIRES: openmp_pstl_backend
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_supported_binary_operations.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_supported_binary_operations.pass.cpp
index 4f8974625f5634..593a63e57c806b 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_supported_binary_operations.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_supported_binary_operations.pass.cpp
@@ -20,7 +20,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp -fno-exceptions
+// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp -fno-exceptions --offload-arch=native -L%{lib}/../../lib -lomptarget -L%{lib}/../../projects/openmp/libomptarget/ -lomptarget.devicertl
 
 // REQUIRES: openmp_pstl_backend
 
@@ -160,8 +160,8 @@ int main(void) {
   // Bitwise or
   {
     std::vector<unsigned int> v(test_size, 0);
-    int result =
-        std::transform_reduce(std::execution::par_unseq, v.begin(), v.end(), 0, std::bit_or{}, [](unsigned int& a) {
+    int result = std::transform_reduce(
+        std::execution::par_unseq, v.begin(), v.end(), 0, std::bit_or{}, [](unsigned int& a) -> unsigned int {
           return a || omp_is_initial_device();
         });
     assert(!result && "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for the "
@@ -169,8 +169,8 @@ int main(void) {
 
     // After adding a one, the result should be true
     v[v.size() / 2] = 1;
-    result =
-        std::transform_reduce(std::execution::par_unseq, v.begin(), v.end(), 0, std::bit_or{}, [](unsigned int& a) {
+    result          = std::transform_reduce(
+        std::execution::par_unseq, v.begin(), v.end(), 0, std::bit_or{}, [](unsigned int& a) -> unsigned int {
           return a && !omp_is_initial_device();
         });
     assert(result && "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for the "
diff --git a/libcxx/utils/libcxx/test/params.py b/libcxx/utils/libcxx/test/params.py
index b3b64215ef2413..5f99533d5b4beb 100644
--- a/libcxx/utils/libcxx/test/params.py
+++ b/libcxx/utils/libcxx/test/params.py
@@ -341,7 +341,7 @@ def getStdFlag(cfg, std):
             AddCompileFlag("-fopenmp"),
             # The linker needs to find the correct version of libomptarget
             AddLinkFlag("-Wl,-rpath,%{lib}/../../lib"),
-            AddLinkFlag("-L%{lib}/../../lib"),
+            #AddLinkFlag("-L%{lib}/../../lib -lomptarget"),
             #  The preprocessor needs to find the omp.h header
             # If OpenMP was installed as a project, the header lives in the
             # following directory
diff --git a/libcxx/utils/run.py b/libcxx/utils/run.py
index 6b4d615444bcfa..5baffadb35ec06 100755
--- a/libcxx/utils/run.py
+++ b/libcxx/utils/run.py
@@ -63,6 +63,21 @@ def main():
         # TEMP is needed for placing temp files in a sensible directory.
         if "TEMP" in os.environ:
             env["TEMP"] = os.environ.get("TEMP")
+    
+    # Forwarding the environment variable CUDA_VISIBLE_DEVICES which configures
+    # the visible NVIDIA GPUs.
+    if 'CUDA_VISIBLE_DEVICES' in os.environ:
+        env['CUDA_VISIBLE_DEVICES'] = os.environ['CUDA_VISIBLE_DEVICES']
+
+    # Forwarding the environment variable ROCR_VISIBLE_DEVICES which configures
+    # the visible AMD GPUs.
+    if 'ROCR_VISIBLE_DEVICES' in os.environ:
+        env['ROCR_VISIBLE_DEVICES'] = os.environ['ROCR_VISIBLE_DEVICES']
+
+    # Pass the OpenMP debug flag. Can be used to print information about the
+    # GPU execution of the tests.
+    if 'LIBOMPTARGET_DEBUG' in os.environ:
+        env['LIBOMPTARGET_DEBUG'] = os.environ['LIBOMPTARGET_DEBUG']
 
     # Run the command line with the given environment in the execution directory.
     return subprocess.call(commandLine, cwd=args.execdir, env=env, shell=False)

>From c3540c65f04a6683a89e73d4ed0a12c8fa201d5a Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Thu, 2 Nov 2023 10:11:15 -0700
Subject: [PATCH 35/65] Clang-formatted for_each_overwrite_input.pass.cpp

---
 .../for_each_overwrite_input.pass.cpp                | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp
index 8729b29ac07213..5c984fca88a468 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp
@@ -42,24 +42,28 @@ int main(void) {
   // Testing with vector of doubles
   {
     std::vector<double> v(test_size, value);
-    overwrite(v, [&](double& n) { n *= n; }, value);
+    overwrite(
+        v, [&](double& n) { n *= n; }, value);
   }
   // Testing with vector of integers
   {
     std::vector<int> v(test_size, (int)value);
-    overwrite(v, [&](int& n) { n *= n; }, (int)value);
+    overwrite(
+        v, [&](int& n) { n *= n; }, (int)value);
   }
   // Testing with array of doubles
   {
     std::array<double, test_size> a;
     a.fill(value);
-    overwrite(a, [&](double& n) { n *= n; }, value);
+    overwrite(
+        a, [&](double& n) { n *= n; }, value);
   }
   // Testing with array of integers
   {
     std::array<int, test_size> a;
     a.fill((int)value);
-    overwrite(a, [&](int& n) { n *= n; }, (int)value);
+    overwrite(
+        a, [&](int& n) { n *= n; }, (int)value);
   }
   return 0;
 }

>From 22d43a7b5fa15fc99b253485345f3f5240eaf0e3 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Thu, 2 Nov 2023 10:14:07 -0700
Subject: [PATCH 36/65] clang-formatted omp_offload.h and fill_offload.pass.cpp

---
 .../__algorithm/pstl_backends/openmp/omp_offload.h   |  8 ++++----
 .../alg.pstl.offload/fill_offload.pass.cpp           |  6 +++---
 libcxx/utils/libcxx/test/params.py                   | 12 ++++++------
 3 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h b/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
index d47fdcd39c39b2..b2e0d46d1fcd25 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
@@ -34,28 +34,28 @@ template <class _Iterator, class _DifferenceType>
 _LIBCPP_HIDE_FROM_ABI void
 __omp_map_to([[maybe_unused]] const _Iterator __p, [[maybe_unused]] const _DifferenceType __len) noexcept {
   static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
-#  pragma omp target enter data map(to : __p[0 : __len])
+#  pragma omp target enter data map(to : __p [0:__len])
 }
 
 template <class _Iterator, class _DifferenceType>
 _LIBCPP_HIDE_FROM_ABI void
 __omp_map_from([[maybe_unused]] const _Iterator __p, [[maybe_unused]] const _DifferenceType __len) noexcept {
   static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
-#  pragma omp target exit data map(from : __p[0 : __len])
+#  pragma omp target exit data map(from : __p [0:__len])
 }
 
 template <class _Iterator, class _DifferenceType>
 _LIBCPP_HIDE_FROM_ABI void
 __omp_map_alloc([[maybe_unused]] const _Iterator __p, [[maybe_unused]] const _DifferenceType __len) noexcept {
   static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
-#  pragma omp target enter data map(alloc : __p[0 : __len])
+#  pragma omp target enter data map(alloc : __p [0:__len])
 }
 
 template <class _Iterator, class _DifferenceType>
 _LIBCPP_HIDE_FROM_ABI void
 __omp_map_release([[maybe_unused]] const _Iterator __p, [[maybe_unused]] const _DifferenceType __len) noexcept {
   static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
-#  pragma omp target exit data map(release : __p[0 : __len])
+#  pragma omp target exit data map(release : __p [0:__len])
 }
 
 } // namespace __omp_backend
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp
index a97cb42845eb6f..c6e14b13e9b186 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp
@@ -35,7 +35,7 @@ int main(void) {
   // the device, because the map inside std::fill will then only increment and
   // decrement reference counters and not move data.
   int* data = v.data();
-#pragma omp target enter data map(to : data[0 : v.size()])
+#pragma omp target enter data map(to : data [0:v.size()])
   std::fill(std::execution::par_unseq, v.begin(), v.end(), -2);
 
   // At this point v should only contain the value 2
@@ -44,11 +44,11 @@ int main(void) {
            "std::fill transferred data from device to the host but should only have decreased the reference counter.");
 
 // After moving the result back to the host it should now be -2
-#pragma omp target update from(data[0 : v.size()])
+#pragma omp target update from(data [0:v.size()])
   for (int vi : v)
     assert(vi == -2 && "std::fill did not update the result on the device.");
 
-#pragma omp target exit data map(delete : data[0 : v.size()])
+#pragma omp target exit data map(delete : data [0:v.size()])
 
   return 0;
 }
diff --git a/libcxx/utils/libcxx/test/params.py b/libcxx/utils/libcxx/test/params.py
index 5f99533d5b4beb..ffc731b93fc069 100644
--- a/libcxx/utils/libcxx/test/params.py
+++ b/libcxx/utils/libcxx/test/params.py
@@ -341,14 +341,14 @@ def getStdFlag(cfg, std):
             AddCompileFlag("-fopenmp"),
             # The linker needs to find the correct version of libomptarget
             AddLinkFlag("-Wl,-rpath,%{lib}/../../lib"),
-            #AddLinkFlag("-L%{lib}/../../lib -lomptarget"),
-            #  The preprocessor needs to find the omp.h header
-            # If OpenMP was installed as a project, the header lives in the
-            # following directory
+            # The preprocessor needs to find the omp.h header. If OpenMP was 
+            # installed as a project, the header lives in the following
+            # directory
             AddFlag("-I %{lib}/../../projects/openmp/runtime/src/"),
-            # And if it was installed as a runtime it lives in
+            # And if it was installed as a runtime it lives in the following:
             AddFlag("-I %{lib}/../../runtimes/runtimes-bins/openmp/runtime/src"),
-            # If the OpenMP PSTL backend was enbaled, we wish to run the tests for it
+            # If the OpenMP PSTL backend was enabled, we wish to run the tests
+            # for it
             AddFeature("openmp_pstl_backend")
         ] if enabled else [],
     )

>From c24ee8886a7d4921ce6f14cd159968ba4d52c293 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Thu, 2 Nov 2023 10:19:39 -0700
Subject: [PATCH 37/65] Changed single quotes to double quotes in run.py

---
 libcxx/utils/run.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/libcxx/utils/run.py b/libcxx/utils/run.py
index 5baffadb35ec06..e172449cf3d18b 100755
--- a/libcxx/utils/run.py
+++ b/libcxx/utils/run.py
@@ -66,18 +66,18 @@ def main():
     
     # Forwarding the environment variable CUDA_VISIBLE_DEVICES which configures
     # the visible NVIDIA GPUs.
-    if 'CUDA_VISIBLE_DEVICES' in os.environ:
-        env['CUDA_VISIBLE_DEVICES'] = os.environ['CUDA_VISIBLE_DEVICES']
+    if "CUDA_VISIBLE_DEVICES" in os.environ:
+        env["CUDA_VISIBLE_DEVICES"] = os.environ["CUDA_VISIBLE_DEVICES"]
 
     # Forwarding the environment variable ROCR_VISIBLE_DEVICES which configures
     # the visible AMD GPUs.
-    if 'ROCR_VISIBLE_DEVICES' in os.environ:
-        env['ROCR_VISIBLE_DEVICES'] = os.environ['ROCR_VISIBLE_DEVICES']
+    if "ROCR_VISIBLE_DEVICES" in os.environ:
+        env["ROCR_VISIBLE_DEVICES"] = os.environ["ROCR_VISIBLE_DEVICES"]
 
     # Pass the OpenMP debug flag. Can be used to print information about the
     # GPU execution of the tests.
-    if 'LIBOMPTARGET_DEBUG' in os.environ:
-        env['LIBOMPTARGET_DEBUG'] = os.environ['LIBOMPTARGET_DEBUG']
+    if "LIBOMPTARGET_DEBUG" in os.environ:
+        env["LIBOMPTARGET_DEBUG"] = os.environ["LIBOMPTARGET_DEBUG"]
 
     # Run the command line with the given environment in the execution directory.
     return subprocess.call(commandLine, cwd=args.execdir, env=env, shell=False)

>From 930c96305a7753c3d15db98bc2cd8b2204a1f50b Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Thu, 2 Nov 2023 10:43:59 -0700
Subject: [PATCH 38/65] Did I finally clang-format omp_offload.h,
 fill_offload.pass.cpp, and for_each_overwrite_input.pass.cpp right?

---
 .../__algorithm/pstl_backends/openmp/omp_offload.h   |  8 ++++----
 .../alg.pstl.offload/fill_offload.pass.cpp           |  6 +++---
 .../for_each_overwrite_input.pass.cpp                | 12 ++++--------
 3 files changed, 11 insertions(+), 15 deletions(-)

diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h b/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
index b2e0d46d1fcd25..d47fdcd39c39b2 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
@@ -34,28 +34,28 @@ template <class _Iterator, class _DifferenceType>
 _LIBCPP_HIDE_FROM_ABI void
 __omp_map_to([[maybe_unused]] const _Iterator __p, [[maybe_unused]] const _DifferenceType __len) noexcept {
   static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
-#  pragma omp target enter data map(to : __p [0:__len])
+#  pragma omp target enter data map(to : __p[0 : __len])
 }
 
 template <class _Iterator, class _DifferenceType>
 _LIBCPP_HIDE_FROM_ABI void
 __omp_map_from([[maybe_unused]] const _Iterator __p, [[maybe_unused]] const _DifferenceType __len) noexcept {
   static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
-#  pragma omp target exit data map(from : __p [0:__len])
+#  pragma omp target exit data map(from : __p[0 : __len])
 }
 
 template <class _Iterator, class _DifferenceType>
 _LIBCPP_HIDE_FROM_ABI void
 __omp_map_alloc([[maybe_unused]] const _Iterator __p, [[maybe_unused]] const _DifferenceType __len) noexcept {
   static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
-#  pragma omp target enter data map(alloc : __p [0:__len])
+#  pragma omp target enter data map(alloc : __p[0 : __len])
 }
 
 template <class _Iterator, class _DifferenceType>
 _LIBCPP_HIDE_FROM_ABI void
 __omp_map_release([[maybe_unused]] const _Iterator __p, [[maybe_unused]] const _DifferenceType __len) noexcept {
   static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
-#  pragma omp target exit data map(release : __p [0:__len])
+#  pragma omp target exit data map(release : __p[0 : __len])
 }
 
 } // namespace __omp_backend
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp
index c6e14b13e9b186..a97cb42845eb6f 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp
@@ -35,7 +35,7 @@ int main(void) {
   // the device, because the map inside std::fill will then only increment and
   // decrement reference counters and not move data.
   int* data = v.data();
-#pragma omp target enter data map(to : data [0:v.size()])
+#pragma omp target enter data map(to : data[0 : v.size()])
   std::fill(std::execution::par_unseq, v.begin(), v.end(), -2);
 
   // At this point v should only contain the value 2
@@ -44,11 +44,11 @@ int main(void) {
            "std::fill transferred data from device to the host but should only have decreased the reference counter.");
 
 // After moving the result back to the host it should now be -2
-#pragma omp target update from(data [0:v.size()])
+#pragma omp target update from(data[0 : v.size()])
   for (int vi : v)
     assert(vi == -2 && "std::fill did not update the result on the device.");
 
-#pragma omp target exit data map(delete : data [0:v.size()])
+#pragma omp target exit data map(delete : data[0 : v.size()])
 
   return 0;
 }
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp
index 5c984fca88a468..8729b29ac07213 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp
@@ -42,28 +42,24 @@ int main(void) {
   // Testing with vector of doubles
   {
     std::vector<double> v(test_size, value);
-    overwrite(
-        v, [&](double& n) { n *= n; }, value);
+    overwrite(v, [&](double& n) { n *= n; }, value);
   }
   // Testing with vector of integers
   {
     std::vector<int> v(test_size, (int)value);
-    overwrite(
-        v, [&](int& n) { n *= n; }, (int)value);
+    overwrite(v, [&](int& n) { n *= n; }, (int)value);
   }
   // Testing with array of doubles
   {
     std::array<double, test_size> a;
     a.fill(value);
-    overwrite(
-        a, [&](double& n) { n *= n; }, value);
+    overwrite(a, [&](double& n) { n *= n; }, value);
   }
   // Testing with array of integers
   {
     std::array<int, test_size> a;
     a.fill((int)value);
-    overwrite(
-        a, [&](int& n) { n *= n; }, (int)value);
+    overwrite(a, [&](int& n) { n *= n; }, (int)value);
   }
   return 0;
 }

>From 2ac232555e962c1e57da9a4d394d623bb5f5645b Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Thu, 2 Nov 2023 12:05:10 -0700
Subject: [PATCH 39/65] run.py whitespace

---
 libcxx/utils/run.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libcxx/utils/run.py b/libcxx/utils/run.py
index e172449cf3d18b..427e96b91fa370 100755
--- a/libcxx/utils/run.py
+++ b/libcxx/utils/run.py
@@ -63,7 +63,7 @@ def main():
         # TEMP is needed for placing temp files in a sensible directory.
         if "TEMP" in os.environ:
             env["TEMP"] = os.environ.get("TEMP")
-    
+
     # Forwarding the environment variable CUDA_VISIBLE_DEVICES which configures
     # the visible NVIDIA GPUs.
     if "CUDA_VISIBLE_DEVICES" in os.environ:

>From a985cb36a942bffccc980afa7afda6ac971a6375 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Thu, 2 Nov 2023 13:32:20 -0700
Subject: [PATCH 40/65] Fixed code formatting in .rst documentation

---
 libcxx/CMakeLists.txt                                  |  7 +++++--
 libcxx/docs/UsingLibcxx.rst                            | 10 +++++-----
 .../__algorithm/pstl_backends/openmp/omp_offload.h     |  2 +-
 3 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt
index 48b33f49c8014e..8f336da5e04d99 100644
--- a/libcxx/CMakeLists.txt
+++ b/libcxx/CMakeLists.txt
@@ -558,6 +558,11 @@ function(cxx_add_basic_build_flags target)
     endif()
   endif()
   target_compile_options(${target} PUBLIC "${LIBCXX_ADDITIONAL_COMPILE_FLAGS}")
+
+  # If the PSTL backend depends on OpenMP, we must enable the OpenMP tool chain
+  if (LIBCXX_PSTL_BACKEND STREQUAL "openmp")
+    target_add_compile_flags_if_supported(${target} PUBLIC -fopenmp)
+  endif()
 endfunction()
 
 # Exception flags =============================================================
@@ -781,8 +786,6 @@ elseif (LIBCXX_PSTL_BACKEND STREQUAL "libdispatch")
   config_define(1 _LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH)
 elseif (LIBCXX_PSTL_BACKEND STREQUAL "openmp")
   config_define(1 _LIBCPP_PSTL_BACKEND_OPENMP)
-  # Making sure that OpenMP is enabled during build
-  add_compile_options(-fopenmp)
 else()
   message(FATAL_ERROR "LIBCXX_PSTL_BACKEND is set to ${LIBCXX_PSTL_BACKEND}, which is not a valid backend.
                        Valid backends are: serial, std-thread, libdispatch, and openmp.")
diff --git a/libcxx/docs/UsingLibcxx.rst b/libcxx/docs/UsingLibcxx.rst
index ce8fb04a8ec156..da267b1fe56ffe 100644
--- a/libcxx/docs/UsingLibcxx.rst
+++ b/libcxx/docs/UsingLibcxx.rst
@@ -477,8 +477,8 @@ iterators for ``std::vector`` or ``std::array``.
 To enable the OpenMP offloading backend it must be selected with
 ``LIBCXX_PSTL_BACKEND=openmp`` when installing ``libc++``. Further, when
 compiling a program, the user must specify the command line options
-``-fopenmp -fexperimental-library -stdlib=libc++``. To install LLVM with OpenMP
-offloading enabled, please read
+``-fopenmp -fexperimental-library``. To install LLVM with OpenMP offloading
+enabled, please read
 `the LLVM OpenMP FAQ. <https://openmp.llvm.org/SupportAndFAQ.html>`_ 
 You may also want to to visit
 `the OpenMP offloading command-line argument reference. <https://openmp.llvm.org/CommandLineArgumentReference.html#offload-command-line-arguments>`_ 
@@ -531,9 +531,9 @@ space. However, discrete GPU systems have distinct address spaces. A single
 address space can be emulated if your system supports unified shared memory.
 However, many discrete GPU systems do not, and in those cases it is important to
 pass device function pointers to the parallel algorithms. Below is an example of
-how the OpenMP `declare target` directive can be used to mark that a function
+how the OpenMP ``declare target`` directive can be used to mark that a function
 should be compiled for both host and device. The device address of a function
-pointer can be obtained with `target map(from:<list of identifiers>)`.
+pointer can be obtained with ``target map(from:<list of identifiers>)``.
 
 .. code-block:: cpp
 
@@ -559,7 +559,7 @@ pointer can be obtained with `target map(from:<list of identifiers>)`.
   }
 
 Without unified shared memory, the above example will not work if the host
-function pointer `cube` is passed to the parallel algorithm.
+function pointer ``cube`` is passed to the parallel algorithm.
 
 Important notes about exception handling
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h b/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
index d47fdcd39c39b2..d2bcfbb98754b3 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
@@ -27,7 +27,7 @@ inline namespace __omp_backend {
 //===----------------------------------------------------------------------===//
 // The following four functions can be used to map contiguous array sections to
 // and from the device. For now, they are simple overlays of the OpenMP pragmas,
-// but they should be updated wen adding support for other iterator types.
+// but they should be updated when adding support for other iterator types.
 //===----------------------------------------------------------------------===//
 
 template <class _Iterator, class _DifferenceType>

>From 05c2858e6a3295dd629aa0177ae1bb2d99dc4abb Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Thu, 2 Nov 2023 15:19:20 -0700
Subject: [PATCH 41/65] Improved formatting of examples in documentation

---
 libcxx/docs/UsingLibcxx.rst | 56 ++++++++++++++++++++-----------------
 1 file changed, 30 insertions(+), 26 deletions(-)

diff --git a/libcxx/docs/UsingLibcxx.rst b/libcxx/docs/UsingLibcxx.rst
index da267b1fe56ffe..ecad68eac9a434 100644
--- a/libcxx/docs/UsingLibcxx.rst
+++ b/libcxx/docs/UsingLibcxx.rst
@@ -487,18 +487,19 @@ Example
 ~~~~~~~
 
 The following is an example of offloading vector addition to a GPU using our
-standard library extension.
+standard library extension. It implements the classical vector addition from
+BLAS that overwrites the vector ``y`` with ``y=a*x+y``. Thus ``y.begin()`` is
+used as both an input and an output iterator in this example.
 
 .. code-block:: cpp
 
   #include <algorithm>
   #include <execution>
 
-  template<typename T1, typename T2, typename T3>
-  void axpy(const T1 a,std::vector<T2>& x, std::vector<T3>& y)
-  {
-    std::transform(std::execution::par_unseq,x.begin(),x.end(), y.begin(), y.begin(),
-                  [=](T2 xi, T3 yi){ return a*xi + yi; });
+  template <typename T1, typename T2, typename T3>
+  void axpy(const T1 a, const std::vector<T2> &x, std::vector<T3> &y) {
+    std::transform(std::execution::par_unseq, x.begin(), x.end(), y.begin(),
+                  y.begin(), [=](T2 xi, T3 yi) { return a * xi + yi; });
   }
 
 The execution policy ``std::execution::par_unseq`` states that the algorithm's
@@ -512,12 +513,11 @@ be implemented in the following way.
 
 .. code-block:: cpp
 
-  template<typename T1, typename T2, typename T3>
-  void axpy(const T1 a,std::vector<T2>& x, std::vector<T3>& y)
-  {
-  # pragma omp target data map(to:a)
-    std::transform(std::execution::par_unseq,x.begin(),x.end(), y.begin(), y.begin(),
-                  [&](T2 xi, T3 yi){ return a*xi + yi; });
+  template <typename T1, typename T2, typename T3>
+  void axpy(const T1 a, const std::vector<T2> &x, std::vector<T3> &y) {
+  #pragma omp target data map(to : a)
+    std::transform(std::execution::par_unseq, x.begin(), x.end(), y.begin(),
+                  y.begin(), [&](T2 xi, T3 yi) { return a * xi + yi; });
   }
 
 However, if unified shared memory, USM, is enabled, no additional data mapping
@@ -539,27 +539,31 @@ pointer can be obtained with ``target map(from:<list of identifiers>)``.
 
   // Declare that the function must be compiled for both host and device
   #pragma omp declare target
-  void cube(int& n) {n*=n*n; };
+  // This function computes the squared difference of two floating-point numbers
+  float squared(float a, float b) { return a * a - 2.0f * a * b + b * b; };
   #pragma omp end declare target
 
-  int main()
-  {
-    std::vector<int> a(LEN,2);
-    // Get the device pointer for cube
-    void (*dcube)(int& n);
-    #pragma omp target map(from:dcube)
-    dcube = &cube;
+  int main() {
+    std::vector<float> a(100, 1.0);
+    std::vector<float> b(100, 1.25);
+
+    // Get the device pointer for squared
+    float (*dev_squared)(float, float);
+  #pragma omp target map(from : dev_squared)
+    dev_squared = &squared;
+
     // Pass the device function pointer to the parallel algorithm
-    std::for_each(std::execution::par_unseq,a.begin(), a.end(),dcube);
-    // Validate that the result is 8 on the host for all array indices
-    std::for_each(std::execution::par,a.begin(), a.end(),[&](int & n){
-      assert(n == 8);
-    });
+    float sum =
+        std::transform_reduce(std::execution::par_unseq, a.begin(), a.end(),
+                              b.begin(), 0.0f, std::plus{}, dev_squared);
+
+    // Validate that the result is approximately 6.25
+    assert(std::abs(sum - 6.25f) < 1e-10);
     return 0;
   }
 
 Without unified shared memory, the above example will not work if the host
-function pointer ``cube`` is passed to the parallel algorithm.
+function pointer ``squared`` is passed to the parallel algorithm.
 
 Important notes about exception handling
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

>From ab7c3d9c4136a8b5df97ce5912f25a8298128967 Mon Sep 17 00:00:00 2001
From: Anton Rydahl <44206479+AntonRydahl at users.noreply.github.com>
Date: Thu, 2 Nov 2023 15:54:18 -0700
Subject: [PATCH 42/65] Update libcxx/docs/UsingLibcxx.rst

Co-authored-by: Louis Dionne <ldionne.2 at gmail.com>
---
 libcxx/docs/UsingLibcxx.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libcxx/docs/UsingLibcxx.rst b/libcxx/docs/UsingLibcxx.rst
index ecad68eac9a434..94e92982bc6519 100644
--- a/libcxx/docs/UsingLibcxx.rst
+++ b/libcxx/docs/UsingLibcxx.rst
@@ -569,7 +569,7 @@ Important notes about exception handling
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 GPU architectures do not support exception handling. If compiling a program
-containing parallel algorithms with ``clang`` 18 or newer, a program with
+containing parallel algorithms with current versions of Clang, a program with
 exceptions in offloaded code regions will compile, but the program will
 terminate if an exception is thrown on the device. This does not conform with
 the C++ standard and exception handling on GPUs will hopefully be better

>From 84df0217a0e2b5b41b33d57c99750047928cf593 Mon Sep 17 00:00:00 2001
From: Anton Rydahl <44206479+AntonRydahl at users.noreply.github.com>
Date: Thu, 2 Nov 2023 15:55:10 -0700
Subject: [PATCH 43/65] Update
 libcxx/include/__algorithm/pstl_backends/openmp.h

Co-authored-by: Louis Dionne <ldionne.2 at gmail.com>
---
 libcxx/include/__algorithm/pstl_backends/openmp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libcxx/include/__algorithm/pstl_backends/openmp.h b/libcxx/include/__algorithm/pstl_backends/openmp.h
index a3b52456dc6a60..e70ba1a0217912 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp.h
@@ -60,7 +60,7 @@ Exceptions
 Currently, GPU architectures do not handle exceptions. OpenMP target regions are
 allowed to contain try/catch statements and throw expressions in Clang, but if a
 throw expression is reached, it will terminate the program. That does not
-conform with the C++ standard.
+conform to the C++ standard.
 
 [This document](https://eel.is/c++draft/algorithms.parallel) has been used as
 reference for these considerations.

>From 2ad39fe6d1512c59c56a8049634689aa9705f4ef Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Thu, 2 Nov 2023 16:02:28 -0700
Subject: [PATCH 44/65] Change function declaration for main functions from int
 main(void) to int main(int,char**)

---
 .../libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp    | 2 +-
 libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if.pass.cpp | 2 +-
 .../libcxx/algorithms/alg.pstl.offload/find_if_offload.pass.cpp | 2 +-
 .../algorithms/alg.pstl.offload/for_each_offload.pass.cpp       | 2 +-
 .../alg.pstl.offload/for_each_overwrite_input.pass.cpp          | 2 +-
 .../alg.pstl.offload/gpu_environemt_variables.pass.cpp          | 2 +-
 .../algorithms/alg.pstl.offload/transform_offload.pass.cpp      | 2 +-
 .../alg.pstl.offload/transform_reduce_offload.pass.cpp          | 2 +-
 .../transform_reduce_supported_binary_operations.pass.cpp       | 2 +-
 9 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp
index a97cb42845eb6f..c74384519cec34 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp
@@ -22,7 +22,7 @@
 #include <vector>
 #include <omp.h>
 
-int main(void) {
+int main(int, char**) {
   // We only run the test if a device is detected by OpenMP
   if (omp_get_num_devices() < 1)
     return 0;
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if.pass.cpp
index 38544d76b86ad7..7d379c032d6122 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if.pass.cpp
@@ -41,7 +41,7 @@ void check_find_if(_Tp& data) {
   }
 }
 
-int main(void) {
+int main(int, char**) {
   const int test_size = 10000;
   // Testing with vector of doubles
   {
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if_offload.pass.cpp
index 82425be01c181f..72cc4568aa26eb 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if_offload.pass.cpp
@@ -22,7 +22,7 @@
 #include <vector>
 #include <omp.h>
 
-int main(void) {
+int main(int, char**) {
   // We only run the test if a device is detected by OpenMP
   if (omp_get_num_devices() < 1)
     return 0;
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_offload.pass.cpp
index e2a2797e5515dc..18417c3c9f0324 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_offload.pass.cpp
@@ -22,7 +22,7 @@
 #include <vector>
 #include <omp.h>
 
-int main(void) {
+int main(int, char**) {
   // We only run the test if a device is detected by OpenMP
   if (omp_get_num_devices() < 1)
     return 0;
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp
index 8729b29ac07213..5151984e7969c9 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp
@@ -36,7 +36,7 @@ void overwrite(_Tp& data, _Predicate pred, const _Up& value) {
         "The GPU implementation of std::for_each does not allow users to mutate the input as the C++ standard does.");
 }
 
-int main(void) {
+int main(int, char**) {
   const double value  = 2.0;
   const int test_size = 10000;
   // Testing with vector of doubles
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/gpu_environemt_variables.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/gpu_environemt_variables.pass.cpp
index eb9265556cb0da..0977f7a1f81048 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/gpu_environemt_variables.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/gpu_environemt_variables.pass.cpp
@@ -30,7 +30,7 @@ std::string get_env_var(std::string const& env_var_name, int& flag) {
   return (val != NULL) ? val : "";
 }
 
-int main(void) {
+int main(int, char**) {
   // Stores whether the environment variable was found
   int status = 0;
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_offload.pass.cpp
index 1be15fd15454ff..74255ce0a99e2f 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_offload.pass.cpp
@@ -22,7 +22,7 @@
 #include <vector>
 #include <omp.h>
 
-int main(void) {
+int main(int, char**) {
   // We only run the test if a device is detected by OpenMP
   if (omp_get_num_devices() < 1)
     return 0;
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_offload.pass.cpp
index 30d45060c4ec28..d141ff307bb701 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_offload.pass.cpp
@@ -23,7 +23,7 @@
 #include <vector>
 #include <omp.h>
 
-int main(void) {
+int main(int, char**) {
   // We only run the test if a device is detected by OpenMP
   if (omp_get_num_devices() < 1)
     return 0;
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_supported_binary_operations.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_supported_binary_operations.pass.cpp
index 593a63e57c806b..5cc57888c4549f 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_supported_binary_operations.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_supported_binary_operations.pass.cpp
@@ -33,7 +33,7 @@
 #include <omp.h>
 #include <iostream>
 
-int main(void) {
+int main(int, char**) {
   // We only run the test if a device is detected by OpenMP
   if (omp_get_num_devices() < 1)
     return 0;

>From 877153a301ea5704ef7bc1be52f9cab26f193eb3 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Thu, 2 Nov 2023 16:34:30 -0700
Subject: [PATCH 45/65] Improved documentation in
 include/__algorithm/pstl_backends/openmp.h

---
 .../__algorithm/pstl_backends/openmp.h        | 128 ++++++++++--------
 1 file changed, 71 insertions(+), 57 deletions(-)

diff --git a/libcxx/include/__algorithm/pstl_backends/openmp.h b/libcxx/include/__algorithm/pstl_backends/openmp.h
index e70ba1a0217912..fdd23e051e5596 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp.h
@@ -11,65 +11,79 @@
 
 #include <__config>
 
-/*
-Combined OpenMP CPU and GPU Backend
-===================================
-Contrary to the CPU backends found in ./cpu_backends/, the OpenMP backend can
-target both CPUs and GPUs. The OpenMP standard defines that when offloading code
-to an accelerator, the compiler must generate a fallback code for execution on
-the host. Thereby, the backend works as a CPU backend if no targeted accelerator
-is available at execution time. The target regions can also be compiled directly
-for a CPU architecture, for instance by adding the command-line option
-`-fopenmp-targets=x86_64-pc-linux-gnu` in Clang.
-
-When is an Algorithm Offloaded?
--------------------------------
-Only parallel algorithms with the parallel unsequenced execution policy are
-offloaded to the device. We cannot offload parallel algorithms with a parallel
-execution policy to GPUs because invocations executing in the same thread "are
-indeterminately sequenced with respect to each other" which we cannot guarantee
-on a GPU.
-
-The standard draft states that "the semantics [...] allow the implementation to
-fall back to sequential execution if the system cannot parallelize an algorithm
-invocation". If it is not deemed safe to offload the parallel algorithm to the
-device, we first fall back to a parallel unsequenced implementation from
-./cpu_backends. The CPU implementation may then fall back to sequential
-execution. In that way we strive to achieve the best possible performance.
-
-Further, "it is the caller's responsibility to ensure that the invocation does
-not introduce data races or deadlocks."
-
-Implicit Assumptions
---------------------
-If the user provides a function pointer as an argument to a parallel algorithm,
-it is assumed that it is the device pointer as there is currently no way to
-check whether a host or device pointer was passed.
-
-Mapping Clauses
----------------
-In some of the parallel algorithms, the user is allowed to provide the same
-iterator as input and output. Hence, the order of the maps matters. Therefore,
-`pragma omp target data map(to:...)` must be used before
-`pragma omp target data map(alloc:...)`. Conversely, the maps with map modifier
-`release` must be placed before the maps with map modifier `from` when
-transferring the result from the device to the host.
-
-Exceptions
-----------
-Currently, GPU architectures do not handle exceptions. OpenMP target regions are
-allowed to contain try/catch statements and throw expressions in Clang, but if a
-throw expression is reached, it will terminate the program. That does not
-conform to the C++ standard.
-
-[This document](https://eel.is/c++draft/algorithms.parallel) has been used as
-reference for these considerations.
-
-*/
-
-#include <__algorithm/pstl_backends/openmp/backend.h>
+// Combined OpenMP CPU and GPU Backend
+// ===================================
+// Contrary to the CPU backends found in ./cpu_backends/, the OpenMP backend can
+// target both CPUs and GPUs. The OpenMP standard defines that when offloading
+// code to an accelerator, the compiler must generate a fallback code for
+// execution on the host. Thereby, the backend works as a CPU backend if no
+// targeted accelerator is available at execution time. The target regions can
+// also be compiled directly for a CPU architecture, for instance by adding the
+// command-line option `-fopenmp-targets=x86_64-pc-linux-gnu` in Clang.
+//
+// When is an Algorithm Offloaded?
+// -------------------------------
+// Only parallel algorithms with the parallel unsequenced execution policy are
+// offloaded to the device. We cannot offload parallel algorithms with a
+// parallel execution policy to GPUs because invocations executing in the same
+// thread "are indeterminately sequenced with respect to each other" which we
+// cannot guarantee on a GPU.
+//
+// The standard draft states that "the semantics [...] allow the implementation
+// to fall back to sequential execution if the system cannot parallelize an
+// algorithm invocation". If it is not deemed safe to offload the parallel
+// algorithm to the device, we first fall back to a parallel unsequenced
+// implementation from ./cpu_backends. The CPU implementation may then fall back
+// to sequential execution. In that way we strive to achieve the best possible
+// performance.
+//
+// Further, "it is the caller's responsibility to ensure that the invocation
+// does not introduce data races or deadlocks."
+//
+// Implicit Assumptions
+// --------------------
+// If the user provides a function pointer as an argument to a parallel
+// algorithm, it is assumed that it is the device pointer as there is currently
+// no way to check whether a host or device pointer was passed.
+//
+// Mapping Clauses
+// ---------------
+// In some of the parallel algorithms, the user is allowed to provide the same
+// iterator as input and output. The order of the maps matters because OpenMP
+// keeps a reference counter of which variables have been mapped to the device.
+// Thereby, a variable is only copied to the device if its reference counter is
+// incremented from zero, and it is only copied back to the host when the
+// reference counter is decremented to zero again.
+// This allows nesting mapped regions, for instance in recursive functions,
+// without enforcing a lot of unnecessary data movement.
+// Therefore, `pragma omp target data map(to:...)` must be used before
+// `pragma omp target data map(alloc:...)`. Conversely, the maps with map
+// modifier `release` must be placed before the maps with map modifier `from`
+// when transferring the result from the device to the host.
+//
+// Example: Assume `a` and `b` are pointers to the same array.
+// ``` C++
+// #pragma omp target enter data map(alloc:a[0:n])
+// // The reference counter is incremented from 0 to 1. a is not copied to the
+// // device because of the `alloc` map modifier.
+// #pragma omp target enter data map(to:b[0:n])
+// // The reference counter is incremented from 1 to 2. b is not copied because
+// // the reference counter is positive. Therefore b, and a, are uninitialized
+// // on the device.
+// ```
+//
+// Exceptions
+// ----------
+// Currently, GPU architectures do not handle exceptions. OpenMP target regions
+// are allowed to contain try/catch statements and throw expressions in Clang,
+// but if a throw expression is reached, it will terminate the program. That
+// does not conform to the C++ standard.
+//
+// [This document](https://eel.is/c++draft/algorithms.parallel) has been used as
+// reference for these considerations.
 
 #include <__algorithm/pstl_backends/openmp/any_of.h>
+#include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__algorithm/pstl_backends/openmp/fill.h>
 #include <__algorithm/pstl_backends/openmp/find_if.h>
 #include <__algorithm/pstl_backends/openmp/for_each.h>

>From 94494487307e7f3e0439ad15b313ed4192908774 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Thu, 2 Nov 2023 16:42:55 -0700
Subject: [PATCH 46/65] Removing accidental change to
 libcxx/test/configs/llvm-libc++-shared.cfg.in

---
 libcxx/test/configs/llvm-libc++-shared.cfg.in | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libcxx/test/configs/llvm-libc++-shared.cfg.in b/libcxx/test/configs/llvm-libc++-shared.cfg.in
index 8ffe69f6271510..143b3b3feae110 100644
--- a/libcxx/test/configs/llvm-libc++-shared.cfg.in
+++ b/libcxx/test/configs/llvm-libc++-shared.cfg.in
@@ -24,4 +24,4 @@ libcxx.test.config.configure(
     libcxx.test.features.DEFAULT_FEATURES,
     config,
     lit_config
-)
\ No newline at end of file
+)

>From 7a3dca7eef62558a9e577c068d554088d6a64550 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Fri, 3 Nov 2023 15:31:28 -0700
Subject: [PATCH 47/65] Made for_each_overwrite_input.pass.cpp compatible with
 clang-format 17.0.1

---
 .../for_each_overwrite_input.pass.cpp                | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp
index 5151984e7969c9..065aa23ba6b965 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp
@@ -42,24 +42,28 @@ int main(int, char**) {
   // Testing with vector of doubles
   {
     std::vector<double> v(test_size, value);
-    overwrite(v, [&](double& n) { n *= n; }, value);
+    overwrite(
+        v, [&](double& n) { n *= n; }, value);
   }
   // Testing with vector of integers
   {
     std::vector<int> v(test_size, (int)value);
-    overwrite(v, [&](int& n) { n *= n; }, (int)value);
+    overwrite(
+        v, [&](int& n) { n *= n; }, (int)value);
   }
   // Testing with array of doubles
   {
     std::array<double, test_size> a;
     a.fill(value);
-    overwrite(a, [&](double& n) { n *= n; }, value);
+    overwrite(
+        a, [&](double& n) { n *= n; }, value);
   }
   // Testing with array of integers
   {
     std::array<int, test_size> a;
     a.fill((int)value);
-    overwrite(a, [&](int& n) { n *= n; }, (int)value);
+    overwrite(
+        a, [&](int& n) { n *= n; }, (int)value);
   }
   return 0;
 }

>From abc0f4920247207d7deb05b37673efd2b213a42f Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Mon, 6 Nov 2023 14:53:35 -0800
Subject: [PATCH 48/65] Moved feature openmp_pstl_backend to feature.py

---
 .../libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp   | 2 +-
 .../test/libcxx/algorithms/alg.pstl.offload/find_if.pass.cpp   | 2 +-
 .../algorithms/alg.pstl.offload/find_if_offload.pass.cpp       | 2 +-
 .../algorithms/alg.pstl.offload/for_each_offload.pass.cpp      | 2 +-
 .../alg.pstl.offload/for_each_overwrite_input.pass.cpp         | 2 +-
 .../alg.pstl.offload/gpu_environemt_variables.pass.cpp         | 2 +-
 .../algorithms/alg.pstl.offload/transform_offload.pass.cpp     | 2 +-
 .../alg.pstl.offload/transform_reduce_offload.pass.cpp         | 2 +-
 .../transform_reduce_supported_binary_operations.pass.cpp      | 2 +-
 libcxx/utils/libcxx/test/features.py                           | 1 +
 libcxx/utils/libcxx/test/params.py                             | 3 ---
 11 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp
index c74384519cec34..905cfd836e945b 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp
@@ -12,7 +12,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp -fno-exceptions --offload-arch=native -L%{lib}/../../lib -lomptarget -L%{lib}/../../projects/openmp/libomptarget/ -lomptarget.devicertl
+// ADDITIONAL_COMPILE_FLAGS: -fno-exceptions --offload-arch=native -L%{lib}/../../lib -lomptarget -L%{lib}/../../projects/openmp/libomptarget/ -lomptarget.devicertl
 
 // REQUIRES: openmp_pstl_backend
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if.pass.cpp
index 7d379c032d6122..48851596a4aa90 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if.pass.cpp
@@ -12,7 +12,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp -fno-exceptions --offload-arch=native -L%{lib}/../../lib -lomptarget -L%{lib}/../../projects/openmp/libomptarget/ -lomptarget.devicertl
+// ADDITIONAL_COMPILE_FLAGS: -fno-exceptions --offload-arch=native -L%{lib}/../../lib -lomptarget -L%{lib}/../../projects/openmp/libomptarget/ -lomptarget.devicertl
 
 // REQUIRES: openmp_pstl_backend
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if_offload.pass.cpp
index 72cc4568aa26eb..538d71dbfa0fa3 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if_offload.pass.cpp
@@ -12,7 +12,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp -fno-exceptions --offload-arch=native -L%{lib}/../../lib -lomptarget  -L%{lib}/../../projects/openmp/libomptarget/ -lomptarget.devicertl
+// ADDITIONAL_COMPILE_FLAGS: -fno-exceptions --offload-arch=native -L%{lib}/../../lib -lomptarget  -L%{lib}/../../projects/openmp/libomptarget/ -lomptarget.devicertl
 
 // REQUIRES: openmp_pstl_backend
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_offload.pass.cpp
index 18417c3c9f0324..942637c6e661be 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_offload.pass.cpp
@@ -12,7 +12,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp -fno-exceptions --offload-arch=native -L%{lib}/../../lib -lomptarget -L%{lib}/../../projects/openmp/libomptarget/ -lomptarget.devicertl
+// ADDITIONAL_COMPILE_FLAGS: -fno-exceptions --offload-arch=native -L%{lib}/../../lib -lomptarget -L%{lib}/../../projects/openmp/libomptarget/ -lomptarget.devicertl
 
 // REQUIRES: openmp_pstl_backend
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp
index 065aa23ba6b965..7bed8f7e80c5e2 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp
@@ -12,7 +12,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp -fno-exceptions --offload-arch=native -L%{lib}/../../lib -lomptarget -L%{lib}/../../projects/openmp/libomptarget/ -lomptarget.devicertl
+// ADDITIONAL_COMPILE_FLAGS: -fno-exceptions --offload-arch=native -L%{lib}/../../lib -lomptarget -L%{lib}/../../projects/openmp/libomptarget/ -lomptarget.devicertl
 
 // REQUIRES: openmp_pstl_backend
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/gpu_environemt_variables.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/gpu_environemt_variables.pass.cpp
index 0977f7a1f81048..71d400ea4ccef6 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/gpu_environemt_variables.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/gpu_environemt_variables.pass.cpp
@@ -13,7 +13,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp -fno-exceptions --offload-arch=native -L%{lib}/../../lib -lomptarget  -L%{lib}/../../projects/openmp/libomptarget/ -lomptarget.devicertl
+// ADDITIONAL_COMPILE_FLAGS: -fno-exceptions --offload-arch=native -L%{lib}/../../lib -lomptarget -L%{lib}/../../projects/openmp/libomptarget/ -lomptarget.devicertl
 
 // REQUIRES: openmp_pstl_backend
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_offload.pass.cpp
index 74255ce0a99e2f..9dd7609bb5b9b2 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_offload.pass.cpp
@@ -12,7 +12,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp -fno-exceptions --offload-arch=native -L%{lib}/../../lib -lomptarget -L%{lib}/../../projects/openmp/libomptarget/ -lomptarget.devicertl
+// ADDITIONAL_COMPILE_FLAGS: -fno-exceptions --offload-arch=native -L%{lib}/../../lib -lomptarget -L%{lib}/../../projects/openmp/libomptarget/ -lomptarget.devicertl
 
 // REQUIRES: openmp_pstl_backend
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_offload.pass.cpp
index d141ff307bb701..f10efbce19c28c 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_offload.pass.cpp
@@ -12,7 +12,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp -fno-exceptions --offload-arch=native -L%{lib}/../../lib -lomptarget -L%{lib}/../../projects/openmp/libomptarget/ -lomptarget.devicertl
+// ADDITIONAL_COMPILE_FLAGS: -fno-exceptions --offload-arch=native -L%{lib}/../../lib -lomptarget -L%{lib}/../../projects/openmp/libomptarget/ -lomptarget.devicertl
 
 // REQUIRES: openmp_pstl_backend
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_supported_binary_operations.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_supported_binary_operations.pass.cpp
index 5cc57888c4549f..594b4a4fcf0bef 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_supported_binary_operations.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_supported_binary_operations.pass.cpp
@@ -20,7 +20,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp -fno-exceptions --offload-arch=native -L%{lib}/../../lib -lomptarget -L%{lib}/../../projects/openmp/libomptarget/ -lomptarget.devicertl
+// ADDITIONAL_COMPILE_FLAGS: -fno-exceptions --offload-arch=native -L%{lib}/../../lib -lomptarget -L%{lib}/../../projects/openmp/libomptarget/ -lomptarget.devicertl
 
 // REQUIRES: openmp_pstl_backend
 
diff --git a/libcxx/utils/libcxx/test/features.py b/libcxx/utils/libcxx/test/features.py
index 29822f55521360..0d4b8bc7c5c232 100644
--- a/libcxx/utils/libcxx/test/features.py
+++ b/libcxx/utils/libcxx/test/features.py
@@ -305,6 +305,7 @@ def _getAndroidDeviceApi(cfg):
     "_LIBCPP_HAS_NO_UNICODE": "libcpp-has-no-unicode",
     "_LIBCPP_HAS_NO_STD_MODULES":  "libcpp-has-no-std-modules",
     "_LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH": "libcpp-pstl-cpu-backend-libdispatch",
+    "_LIBCPP_PSTL_BACKEND_OPENMP": "openmp_pstl_backend"
 }
 for macro, feature in macros.items():
     DEFAULT_FEATURES.append(
diff --git a/libcxx/utils/libcxx/test/params.py b/libcxx/utils/libcxx/test/params.py
index ffc731b93fc069..008bfea095cc9b 100644
--- a/libcxx/utils/libcxx/test/params.py
+++ b/libcxx/utils/libcxx/test/params.py
@@ -347,9 +347,6 @@ def getStdFlag(cfg, std):
             AddFlag("-I %{lib}/../../projects/openmp/runtime/src/"),
             # And if it was installed as a runtime it lives in the following:
             AddFlag("-I %{lib}/../../runtimes/runtimes-bins/openmp/runtime/src"),
-            # If the OpenMP PSTL backend was enbaled, we wish to run the tests
-            # for it
-            AddFeature("openmp_pstl_backend")
         ] if enabled else [],
     )
 ]

>From 3c72664abaf6983d2c5f50d70082909094bf7ba9 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Mon, 6 Nov 2023 17:53:38 -0800
Subject: [PATCH 49/65] Formatted utils/libcxx/test/features.py with darker

---
 libcxx/utils/libcxx/test/features.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libcxx/utils/libcxx/test/features.py b/libcxx/utils/libcxx/test/features.py
index 0d4b8bc7c5c232..dc574b46d4935f 100644
--- a/libcxx/utils/libcxx/test/features.py
+++ b/libcxx/utils/libcxx/test/features.py
@@ -305,7 +305,7 @@ def _getAndroidDeviceApi(cfg):
     "_LIBCPP_HAS_NO_UNICODE": "libcpp-has-no-unicode",
     "_LIBCPP_HAS_NO_STD_MODULES":  "libcpp-has-no-std-modules",
     "_LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH": "libcpp-pstl-cpu-backend-libdispatch",
-    "_LIBCPP_PSTL_BACKEND_OPENMP": "openmp_pstl_backend"
+    "_LIBCPP_PSTL_BACKEND_OPENMP": "openmp_pstl_backend",
 }
 for macro, feature in macros.items():
     DEFAULT_FEATURES.append(

>From f0e40f1b674c64ff2b6e9c6d320246dc2fc698ff Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Tue, 7 Nov 2023 22:17:09 -0800
Subject: [PATCH 50/65] Added tests for exceptions and automatic mapping of
 function pointers

---
 libcxx/docs/UsingLibcxx.rst                   | 21 +++----
 .../find_if_exception.pass.cpp                | 50 +++++++++++++++++
 .../alg.pstl.offload/find_if_funptr.pass.cpp  | 42 ++++++++++++++
 .../alg.pstl.offload/for_each_funptr.pass.cpp | 42 ++++++++++++++
 .../alg.pstl.offload/for_each_lambda.pass.cpp | 55 +++++++++++++++++++
 .../openmp_exception_warning.verify.cpp       | 46 ++++++++++++++++
 6 files changed, 243 insertions(+), 13 deletions(-)
 create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if_exception.pass.cpp
 create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if_funptr.pass.cpp
 create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_funptr.pass.cpp
 create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_lambda.pass.cpp
 create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_exception_warning.verify.cpp

diff --git a/libcxx/docs/UsingLibcxx.rst b/libcxx/docs/UsingLibcxx.rst
index 94e92982bc6519..6773771a26c697 100644
--- a/libcxx/docs/UsingLibcxx.rst
+++ b/libcxx/docs/UsingLibcxx.rst
@@ -531,31 +531,26 @@ space. However, discrete GPU systems have distinct address spaces. A single
 address space can be emulated if your system supports unified shared memory.
 However, many discrete GPU systems do not, and in those cases it is important to
 pass device function pointers to the parallel algorithms. Below is an example of
-how the OpenMP ``declare target`` directive can be used to mark that a function
-should be compiled for both host and device. The device address of a function
-pointer can be obtained with ``target map(from:<list of identifiers>)``.
+how the OpenMP ``declare target`` directive with the ``indirect`` clause can be
+used to mark that a function should be compiled for both host and device.
 
 .. code-block:: cpp
 
-  // Declare that the function must be compiled for both host and device
-  #pragma omp declare target
   // This function computes the squared difference of two floating points
   float squared(float a, float b) { return a * a - 2.0f * a * b + b * b; };
-  #pragma omp end declare target
+
+  // Declare that the function must be compiled for both host and device
+  #pragma omp declare target indirect to(squared)
 
   int main() {
     std::vector<float> a(100, 1.0);
     std::vector<float> b(100, 1.25);
 
-    // Get the device pointer for squared
-    float (*dev_squared)(float, float);
-  #pragma omp target map(from : dev_squared)
-    dev_squared = &squared;
-
-    // Pass the device function pointer to the parallel algorithm
+    // Pass the host function pointer to the parallel algorithm and let OpenMP
+    // translate it to the device function pointer internally
     float sum =
         std::transform_reduce(std::execution::par_unseq, a.begin(), a.end(),
-                              b.begin(), 0.0f, std::plus{}, dev_squared);
+                              b.begin(), 0.0f, std::plus{}, squared);
 
     // Validate that the result is approximately 6.25
     assert(std::abs(sum - 6.25f) < 1e-10);
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if_exception.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if_exception.pass.cpp
new file mode 100644
index 00000000000000..d066359f9634eb
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if_exception.pass.cpp
@@ -0,0 +1,50 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This test verifies that we can run code with exceptions on the device, as
+// long as no exception is actually thrown.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// ADDITIONAL_COMPILE_FLAGS:  -Wno-openmp-target-exception -fexceptions --offload-arch=native -L%{lib}/../../lib -lomptarget -L%{lib}/../../projects/openmp/libomptarget/ -lomptarget.devicertl
+
+// REQUIRES: openmp_pstl_backend
+
+#include <algorithm>
+#include <cassert>
+#include <execution>
+#include <vector>
+#include <omp.h>
+
+bool is_even(int& i) {
+  try {
+    if ((i % 2) == 0) {
+      return true;
+    } else {
+      throw false;
+    }
+  } catch (bool b) {
+    return b;
+  }
+}
+#pragma omp declare target indirect to(is_even)
+
+int main(int, char**) {
+  // We only run the test if a device is detected by OpenMP
+  if (omp_get_num_devices() < 1)
+    return 0;
+
+  const int test_size = 10000;
+  std::vector<int> v(test_size, 2);
+
+  // Providing find_if a function pointer
+  auto idx = std::find_if(std::execution::par_unseq, v.begin(), v.end(), is_even);
+
+  assert(idx == v.begin() && "std::find_if(std::execution::par_unseq,...) does not support exception handling.");
+  return 0;
+}
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if_funptr.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if_funptr.pass.cpp
new file mode 100644
index 00000000000000..69e6ad9ea1de1e
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if_funptr.pass.cpp
@@ -0,0 +1,42 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This test verifies that we can provide function pointers as input to
+// std::find_if. The OpenMP declare target directive with the `indirect` clause
+// makes an implicit mapping of the host and device function pointers.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// ADDITIONAL_COMPILE_FLAGS: -fno-exceptions --offload-arch=native -L%{lib}/../../lib -lomptarget -L%{lib}/../../projects/openmp/libomptarget/ -lomptarget.devicertl
+
+// REQUIRES: openmp_pstl_backend
+
+#include <algorithm>
+#include <cassert>
+#include <execution>
+#include <vector>
+#include <omp.h>
+
+bool is_odd(int& i) { return (i % 2) == 1; }
+#pragma omp declare target indirect to(is_odd)
+
+int main(int, char**) {
+  // We only run the test if a device is detected by OpenMP
+  if (omp_get_num_devices() < 1)
+    return 0;
+
+  const int test_size = 10000;
+  std::vector<int> v(test_size, 2.0);
+  v[123] = 3;
+
+  // Providing find_if a function pointer
+  auto idx = std::find_if(std::execution::par_unseq, v.begin(), v.end(), is_odd);
+
+  assert(idx - v.begin() == 123 && "std::find_if(std::execution::par_unseq,...) does not accept function pointers");
+  return 0;
+}
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_funptr.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_funptr.pass.cpp
new file mode 100644
index 00000000000000..6769e61d0ef41f
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_funptr.pass.cpp
@@ -0,0 +1,42 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This test verifies that we can provide function pointers as input to
+// std::for_each. The OpenMP declare target directive with the `indirect` clause
+// makes an implicit mapping of the host and device function pointers.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// ADDITIONAL_COMPILE_FLAGS: -fno-exceptions --offload-arch=native -L%{lib}/../../lib -lomptarget -L%{lib}/../../projects/openmp/libomptarget/ -lomptarget.devicertl
+
+// REQUIRES: openmp_pstl_backend
+
+#include <algorithm>
+#include <cassert>
+#include <execution>
+#include <vector>
+#include <omp.h>
+
+void cube(double& d) { d *= d * d; }
+#pragma omp declare target indirect to(cube)
+
+int main(int, char**) {
+  // We only run the test if a device is detected by OpenMP
+  if (omp_get_num_devices() < 1)
+    return 0;
+
+  const int test_size = 10000;
+  std::vector<double> v(test_size, 2.0);
+
+  // Providing for_each a function pointer
+  std::for_each(std::execution::par_unseq, v.begin(), v.end(), cube);
+
+  for (int vi : v)
+    assert(vi == 8 && "std::for_each(std::execution::par_unseq,...) does not accept function pointers");
+  return 0;
+}
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_lambda.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_lambda.pass.cpp
new file mode 100644
index 00000000000000..7b1b5a88b3f08d
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_lambda.pass.cpp
@@ -0,0 +1,55 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This test verifies that we can provide a lambda as input to std::for_each in
+// different ways.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// ADDITIONAL_COMPILE_FLAGS: -fno-exceptions --offload-arch=native -L%{lib}/../../lib -lomptarget -L%{lib}/../../projects/openmp/libomptarget/ -lomptarget.devicertl
+
+// REQUIRES: openmp_pstl_backend
+
+#include <algorithm>
+#include <cassert>
+#include <execution>
+#include <vector>
+#include <omp.h>
+
+template <class Function, class Tp>
+void test_lambda(Function fun, Tp initial_value, Tp final_value) {
+  const int test_size = 10000;
+  std::vector<double> v(test_size, initial_value);
+
+  // Providing for_each a lambda
+  std::for_each(std::execution::par_unseq, v.begin(), v.end(), fun);
+
+  for (int vi : v)
+    assert(vi == final_value && "std::for_each(std::execution::par_unseq,...) does not accept lambdas");
+}
+
+int main(int, char**) {
+  // We only run the test if a device is detected by OpenMP
+  if (omp_get_num_devices() < 1)
+    return 0;
+
+  // Capturing by reference
+  auto cube_ref = [&](double& a) { a *= a * a; };
+  test_lambda(cube_ref, 2.0, 8.0);
+
+  // Capturing by value
+  auto cube_val = [=](double& a) { a *= a * a; };
+  test_lambda(cube_val, 2.0, 8.0);
+
+  // Capturing by reference when using additional input
+  double c1       = 1.0;
+  auto cube_ref_2 = [&](double& a) { a = a * a * a + c1; };
+#pragma omp target data map(to : c1)
+  test_lambda(cube_ref_2, 2.0, 9.0);
+  return 0;
+}
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_exception_warning.verify.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_exception_warning.verify.cpp
new file mode 100644
index 00000000000000..48b246cad7f4ab
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_exception_warning.verify.cpp
@@ -0,0 +1,46 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This test validates that the user is prompted with a warning if they use
+// exception handling inside a function called in an offloaded parallel
+// algorithm.
+
+// This test must be compiled with --offload-device-only so that the verify
+// step does not also expect these warnings for the host fall-back code.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// ADDITIONAL_COMPILE_FLAGS: -Wopenmp-target-exception -fexceptions --offload-arch=native --offload-device-only -L%{lib}/../../lib -lomptarget -L%{lib}/../../projects/openmp/libomptarget/ -lomptarget.devicertl
+
+// REQUIRES: openmp_pstl_backend
+
+#include <algorithm>
+#include <execution>
+#include <vector>
+
+bool is_odd(int& i) {
+  try { // expected-warning {{target 'amdgcn-amd-amdhsa' does not support exception handling; 'catch' block is ignored}}
+    if (i % 2 == 0) {
+      return true;
+    } else {
+      throw false; // expected-warning {{target 'amdgcn-amd-amdhsa' does not support exception handling; 'throw' is assumed to be never reached}}
+    }
+  } catch (bool b) {
+    return b;
+  }
+}
+#pragma omp declare target indirect to(is_odd)
+
+int main(int, char**) {
+  const int test_size = 10000;
+  std::vector<int> v(test_size, 2.0);
+
+  // Providing find_if a function pointer
+  std::find_if(std::execution::par_unseq, v.begin(), v.end(), is_odd);
+  return 0;
+}
\ No newline at end of file

>From 56cd8d698db57c4ebc65fac0bdf1c5e4d41e0a2d Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Fri, 10 Nov 2023 12:17:02 -0800
Subject: [PATCH 51/65] Removed amdgcn-amd-amdhsa requirement from test

---
 .../alg.pstl.offload/openmp_exception_warning.verify.cpp      | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_exception_warning.verify.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_exception_warning.verify.cpp
index 48b246cad7f4ab..a93f74db5e02f3 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_exception_warning.verify.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_exception_warning.verify.cpp
@@ -24,11 +24,11 @@
 #include <vector>
 
 bool is_odd(int& i) {
-  try { // expected-warning {{target 'amdgcn-amd-amdhsa' does not support exception handling; 'catch' block is ignored}}
+  try { // expected-warning {{does not support exception handling; 'catch' block is ignored}}
     if (i % 2 == 0) {
       return true;
     } else {
-      throw false; // expected-warning {{target 'amdgcn-amd-amdhsa' does not support exception handling; 'throw' is assumed to be never reached}}
+      throw false; // expected-warning {{does not support exception handling; 'throw' is assumed to be never reached}}
     }
   } catch (bool b) {
     return b;

>From af9b27579f0db4ea03f47ee3a5ae0b46260eb865 Mon Sep 17 00:00:00 2001
From: Anton Rydahl <rydahlanton at gmail.com>
Date: Tue, 26 Mar 2024 21:58:14 +0000
Subject: [PATCH 52/65] Clang-formatted PSTL offload test and added new files
 to libcxx/include/libcxx.imp

---
 libcxx/include/libcxx.imp                            | 11 +++++++++++
 .../for_each_overwrite_input.pass.cpp                | 12 ++++--------
 2 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/libcxx/include/libcxx.imp b/libcxx/include/libcxx.imp
index 56ea58262828a0..d80899217fabdd 100644
--- a/libcxx/include/libcxx.imp
+++ b/libcxx/include/libcxx.imp
@@ -87,6 +87,17 @@
   { include: [ "<__algorithm/pstl_backends/cpu_backends/thread.h>", "private", "<algorithm>", "public" ] },
   { include: [ "<__algorithm/pstl_backends/cpu_backends/transform.h>", "private", "<algorithm>", "public" ] },
   { include: [ "<__algorithm/pstl_backends/cpu_backends/transform_reduce.h>", "private", "<algorithm>", "public" ] },
+  { include: [ "<__algorithm/pstl_backends/openmp.h>", "private", "<algorithm>", "public" ] },
+  { include: [ "<__algorithm/pstl_backends/openmp/any_of.h>", "private", "<algorithm>", "public" ] },
+  { include: [ "<__algorithm/pstl_backends/openmp/backend.h>", "private", "<algorithm>", "public" ] },
+  { include: [ "<__algorithm/pstl_backends/openmp/fill.h>", "private", "<algorithm>", "public" ] },
+  { include: [ "<__algorithm/pstl_backends/openmp/find_if.h>", "private", "<algorithm>", "public" ] },
+  { include: [ "<__algorithm/pstl_backends/openmp/for_each.h>", "private", "<algorithm>", "public" ] },
+  { include: [ "<__algorithm/pstl_backends/openmp/merge.h>", "private", "<algorithm>", "public" ] },
+  { include: [ "<__algorithm/pstl_backends/openmp/omp_offload.h>", "private", "<algorithm>", "public" ] },
+  { include: [ "<__algorithm/pstl_backends/openmp/stable_sort.h>", "private", "<algorithm>", "public" ] },
+  { include: [ "<__algorithm/pstl_backends/openmp/transform.h>", "private", "<algorithm>", "public" ] },
+  { include: [ "<__algorithm/pstl_backends/openmp/transform_reduce.h>", "private", "<algorithm>", "public" ] },
   { include: [ "<__algorithm/pstl_copy.h>", "private", "<algorithm>", "public" ] },
   { include: [ "<__algorithm/pstl_count.h>", "private", "<algorithm>", "public" ] },
   { include: [ "<__algorithm/pstl_equal.h>", "private", "<algorithm>", "public" ] },
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp
index 7bed8f7e80c5e2..8a0743d600a161 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp
@@ -42,28 +42,24 @@ int main(int, char**) {
   // Testing with vector of doubles
   {
     std::vector<double> v(test_size, value);
-    overwrite(
-        v, [&](double& n) { n *= n; }, value);
+    overwrite(v, [&](double& n) { n *= n; }, value);
   }
   // Testing with vector of integers
   {
     std::vector<int> v(test_size, (int)value);
-    overwrite(
-        v, [&](int& n) { n *= n; }, (int)value);
+    overwrite(v, [&](int& n) { n *= n; }, (int)value);
   }
   // Testing with array of doubles
   {
     std::array<double, test_size> a;
     a.fill(value);
-    overwrite(
-        a, [&](double& n) { n *= n; }, value);
+    overwrite(a, [&](double& n) { n *= n; }, value);
   }
   // Testing with array of integers
   {
     std::array<int, test_size> a;
     a.fill((int)value);
-    overwrite(
-        a, [&](int& n) { n *= n; }, (int)value);
+    overwrite(a, [&](int& n) { n *= n; }, (int)value);
   }
   return 0;
 }

>From 768c0c8ffd6c07373c07e0797bd072d268d96895 Mon Sep 17 00:00:00 2001
From: Anton Rydahl <rydahlanton at gmail.com>
Date: Wed, 27 Mar 2024 02:20:46 +0000
Subject: [PATCH 53/65] Fix broken tests

---
 .../pstl_backends/openmp/find_if.h            |  1 +
 libcxx/include/module.modulemap               | 19 +++++++++++++++++++
 2 files changed, 20 insertions(+)

diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
index bf569a4e61520a..2586a1ada05e31 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
@@ -15,6 +15,7 @@
 #include <__algorithm/unwrap_iter.h>
 #include <__config>
 #include <__iterator/wrap_iter.h>
+#include <__type_traits/is_execution_policy.h>
 #include <__type_traits/is_trivially_copyable.h>
 #include <__type_traits/remove_pointer.h>
 #include <optional>
diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap
index 03d18775631ed6..131fe7f82d1605 100644
--- a/libcxx/include/module.modulemap
+++ b/libcxx/include/module.modulemap
@@ -740,6 +740,25 @@ module std_private_algorithm_pstl_backends_cpu_backends_transform        [system
   export std_private_algorithm_transform
 }
 module std_private_algorithm_pstl_backends_cpu_backends_transform_reduce [system] { header "__algorithm/pstl_backends/cpu_backends/transform_reduce.h" }
+module std_private_algorithm_pstl_backends_openmp                        [system] {
+  header "__algorithm/pstl_backends/openmp.h"
+  export *
+}
+module std_private_algorithm_pstl_backends_openmp_any_of                 [system] { header "__algorithm/pstl_backends/openmp/any_of.h" }
+module std_private_algorithm_pstl_backends_openmp_backend                [system] {
+  header "__algorithm/pstl_backends/openmp/backend.h"
+  export *
+}
+module std_private_algorithm_pstl_backends_openmp_fill                   [system] { header "__algorithm/pstl_backends/openmp/fill.h" }
+module std_private_algorithm_pstl_backends_openmp_find_if                [system] { header "__algorithm/pstl_backends/openmp/find_if.h" }
+module std_private_algorithm_pstl_backends_openmp_for_each               [system] { header "__algorithm/pstl_backends/openmp/for_each.h" }
+module std_private_algorithm_pstl_backends_openmp_merge                  [system] { header "__algorithm/pstl_backends/openmp/merge.h" }
+module std_private_algorithm_pstl_backends_openmp_omp_offload            [system] { header "__algorithm/pstl_backends/openmp/omp_offload.h" }
+module std_private_algorithm_pstl_backends_openmp_stable_sort            [system] { header "__algorithm/pstl_backends/openmp/stable_sort.h" }
+module std_private_algorithm_pstl_backends_openmp_transform              [system] {
+  header "__algorithm/pstl_backends/openmp/transform.h"
+}
+module std_private_algorithm_pstl_backends_openmp_transform_reduce       [system] { header "__algorithm/pstl_backends/openmp/transform_reduce.h" }
 module std_private_algorithm_pstl_copy                                   [system] { header "__algorithm/pstl_copy.h" }
 module std_private_algorithm_pstl_count                                  [system] { header "__algorithm/pstl_count.h" }
 module std_private_algorithm_pstl_equal                                  [system] { header "__algorithm/pstl_equal.h" }

>From 9667baf3fd402d417dee649902d80aed281b2541 Mon Sep 17 00:00:00 2001
From: Anton Rydahl <rydahlanton at gmail.com>
Date: Wed, 27 Mar 2024 13:56:45 +0000
Subject: [PATCH 54/65] Pushing and popping macros to avoid replacing reduction
 operation in OpenMP pragmas

---
 libcxx/include/__algorithm/pstl_backends/openmp/find_if.h   | 5 +++++
 .../__algorithm/pstl_backends/openmp/transform_reduce.h     | 6 ++++++
 2 files changed, 11 insertions(+)

diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
index 2586a1ada05e31..3eb7907faf0237 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
@@ -24,6 +24,9 @@
 #  pragma GCC system_header
 #endif
 
+_LIBCPP_PUSH_MACROS
+#include <__undef_macros>
+
 #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
 
 _LIBCPP_BEGIN_NAMESPACE_STD
@@ -62,4 +65,6 @@ _LIBCPP_END_NAMESPACE_STD
 
 #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
 
+_LIBCPP_POP_MACROS
+
 #endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_FIND_IF_H
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
index de617423899a07..4f0b7a80a4e12a 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
@@ -21,12 +21,16 @@
 #include <__type_traits/is_trivially_copyable.h>
 #include <__type_traits/operation_traits.h>
 #include <__type_traits/remove_pointer.h>
+#include <__utility/move.h>
 #include <optional>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
 #endif
 
+_LIBCPP_PUSH_MACROS
+#include <__undef_macros>
+
 #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
 
 _LIBCPP_BEGIN_NAMESPACE_STD
@@ -186,4 +190,6 @@ _LIBCPP_END_NAMESPACE_STD
 
 #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
 
+_LIBCPP_POP_MACROS
+
 #endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_TRANSFORM_REDUCE_H

>From c0e1e1ab989217058779a91d4e601a6f17f41aa6 Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2 at gmail.com>
Date: Wed, 27 Mar 2024 14:07:11 -0400
Subject: [PATCH 55/65] Fix a few comments in the tests

---
 .../{alg.pstl.offload => alg.pstl.openmp}/fill_offload.pass.cpp | 0
 .../{alg.pstl.offload => alg.pstl.openmp}/find_if.pass.cpp      | 0
 .../find_if_exception.pass.cpp                                  | 0
 .../find_if_funptr.pass.cpp                                     | 0
 .../find_if_offload.pass.cpp                                    | 0
 .../for_each_funptr.pass.cpp                                    | 0
 .../for_each_lambda.pass.cpp                                    | 0
 .../for_each_offload.pass.cpp                                   | 0
 .../for_each_overwrite_input.pass.cpp                           | 0
 .../gpu_environment_variables.pass.cpp}                         | 2 +-
 .../openmp_exception_warning.verify.cpp                         | 2 +-
 .../openmp_version_40.verify.cpp                                | 0
 .../openmp_version_45.verify.cpp                                | 0
 .../openmp_version_51.verify.cpp                                | 0
 .../transform_offload.pass.cpp                                  | 0
 .../transform_reduce_offload.pass.cpp                           | 0
 .../transform_reduce_supported_binary_operations.pass.cpp       | 0
 17 files changed, 2 insertions(+), 2 deletions(-)
 rename libcxx/test/libcxx/algorithms/{alg.pstl.offload => alg.pstl.openmp}/fill_offload.pass.cpp (100%)
 rename libcxx/test/libcxx/algorithms/{alg.pstl.offload => alg.pstl.openmp}/find_if.pass.cpp (100%)
 rename libcxx/test/libcxx/algorithms/{alg.pstl.offload => alg.pstl.openmp}/find_if_exception.pass.cpp (100%)
 rename libcxx/test/libcxx/algorithms/{alg.pstl.offload => alg.pstl.openmp}/find_if_funptr.pass.cpp (100%)
 rename libcxx/test/libcxx/algorithms/{alg.pstl.offload => alg.pstl.openmp}/find_if_offload.pass.cpp (100%)
 rename libcxx/test/libcxx/algorithms/{alg.pstl.offload => alg.pstl.openmp}/for_each_funptr.pass.cpp (100%)
 rename libcxx/test/libcxx/algorithms/{alg.pstl.offload => alg.pstl.openmp}/for_each_lambda.pass.cpp (100%)
 rename libcxx/test/libcxx/algorithms/{alg.pstl.offload => alg.pstl.openmp}/for_each_offload.pass.cpp (100%)
 rename libcxx/test/libcxx/algorithms/{alg.pstl.offload => alg.pstl.openmp}/for_each_overwrite_input.pass.cpp (100%)
 rename libcxx/test/libcxx/algorithms/{alg.pstl.offload/gpu_environemt_variables.pass.cpp => alg.pstl.openmp/gpu_environment_variables.pass.cpp} (99%)
 rename libcxx/test/libcxx/algorithms/{alg.pstl.offload => alg.pstl.openmp}/openmp_exception_warning.verify.cpp (99%)
 rename libcxx/test/libcxx/algorithms/{alg.pstl.offload => alg.pstl.openmp}/openmp_version_40.verify.cpp (100%)
 rename libcxx/test/libcxx/algorithms/{alg.pstl.offload => alg.pstl.openmp}/openmp_version_45.verify.cpp (100%)
 rename libcxx/test/libcxx/algorithms/{alg.pstl.offload => alg.pstl.openmp}/openmp_version_51.verify.cpp (100%)
 rename libcxx/test/libcxx/algorithms/{alg.pstl.offload => alg.pstl.openmp}/transform_offload.pass.cpp (100%)
 rename libcxx/test/libcxx/algorithms/{alg.pstl.offload => alg.pstl.openmp}/transform_reduce_offload.pass.cpp (100%)
 rename libcxx/test/libcxx/algorithms/{alg.pstl.offload => alg.pstl.openmp}/transform_reduce_supported_binary_operations.pass.cpp (100%)

diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/fill_offload.pass.cpp
similarity index 100%
rename from libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp
rename to libcxx/test/libcxx/algorithms/alg.pstl.openmp/fill_offload.pass.cpp
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if.pass.cpp
similarity index 100%
rename from libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if.pass.cpp
rename to libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if.pass.cpp
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if_exception.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_exception.pass.cpp
similarity index 100%
rename from libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if_exception.pass.cpp
rename to libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_exception.pass.cpp
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if_funptr.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_funptr.pass.cpp
similarity index 100%
rename from libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if_funptr.pass.cpp
rename to libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_funptr.pass.cpp
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_offload.pass.cpp
similarity index 100%
rename from libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if_offload.pass.cpp
rename to libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_offload.pass.cpp
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_funptr.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_funptr.pass.cpp
similarity index 100%
rename from libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_funptr.pass.cpp
rename to libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_funptr.pass.cpp
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_lambda.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_lambda.pass.cpp
similarity index 100%
rename from libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_lambda.pass.cpp
rename to libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_lambda.pass.cpp
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_offload.pass.cpp
similarity index 100%
rename from libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_offload.pass.cpp
rename to libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_offload.pass.cpp
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_overwrite_input.pass.cpp
similarity index 100%
rename from libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp
rename to libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_overwrite_input.pass.cpp
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/gpu_environemt_variables.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/gpu_environment_variables.pass.cpp
similarity index 99%
rename from libcxx/test/libcxx/algorithms/alg.pstl.offload/gpu_environemt_variables.pass.cpp
rename to libcxx/test/libcxx/algorithms/alg.pstl.openmp/gpu_environment_variables.pass.cpp
index 71d400ea4ccef6..251c39b98cde98 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/gpu_environemt_variables.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/gpu_environment_variables.pass.cpp
@@ -48,4 +48,4 @@ int main(int, char**) {
         (cuda_visible_devices.empty() || (omp_get_num_devices() > 0)) &&
         "CUDA_VISIBLE_DEVICES was set but no devices were detected by OpenMP. The libc++ test suite is misconfigured.");
   return 0;
-}
\ No newline at end of file
+}
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_exception_warning.verify.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_exception_warning.verify.cpp
similarity index 99%
rename from libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_exception_warning.verify.cpp
rename to libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_exception_warning.verify.cpp
index a93f74db5e02f3..1b0eaddc999597 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_exception_warning.verify.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_exception_warning.verify.cpp
@@ -43,4 +43,4 @@ int main(int, char**) {
   // Providing find_if a function pointer
   std::find_if(std::execution::par_unseq, v.begin(), v.end(), is_odd);
   return 0;
-}
\ No newline at end of file
+}
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_version_40.verify.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_version_40.verify.cpp
similarity index 100%
rename from libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_version_40.verify.cpp
rename to libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_version_40.verify.cpp
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_version_45.verify.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_version_45.verify.cpp
similarity index 100%
rename from libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_version_45.verify.cpp
rename to libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_version_45.verify.cpp
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_version_51.verify.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_version_51.verify.cpp
similarity index 100%
rename from libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_version_51.verify.cpp
rename to libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_version_51.verify.cpp
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_offload.pass.cpp
similarity index 100%
rename from libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_offload.pass.cpp
rename to libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_offload.pass.cpp
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_reduce_offload.pass.cpp
similarity index 100%
rename from libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_offload.pass.cpp
rename to libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_reduce_offload.pass.cpp
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_supported_binary_operations.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_reduce_supported_binary_operations.pass.cpp
similarity index 100%
rename from libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_supported_binary_operations.pass.cpp
rename to libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_reduce_supported_binary_operations.pass.cpp

>From e2a49d2c033a1dc5629900ab0d7a1fc2bb416df5 Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2 at gmail.com>
Date: Wed, 27 Mar 2024 14:48:35 -0400
Subject: [PATCH 56/65] [libc++] Remove test file that doesn't belong in libc++

---
 .../find_if_exception.pass.cpp                | 50 -------------------
 1 file changed, 50 deletions(-)
 delete mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_exception.pass.cpp

diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_exception.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_exception.pass.cpp
deleted file mode 100644
index d066359f9634eb..00000000000000
--- a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_exception.pass.cpp
+++ /dev/null
@@ -1,50 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// This test verifies that we can run code with exceptions on the device, as
-// long as no exception is not thrown.
-
-// UNSUPPORTED: c++03, c++11, c++14, gcc
-
-// ADDITIONAL_COMPILE_FLAGS:  -Wno-openmp-target-exception -fexceptions --offload-arch=native -L%{lib}/../../lib -lomptarget -L%{lib}/../../projects/openmp/libomptarget/ -lomptarget.devicertl
-
-// REQUIRES: openmp_pstl_backend
-
-#include <algorithm>
-#include <cassert>
-#include <execution>
-#include <vector>
-#include <omp.h>
-
-bool is_even(int& i) {
-  try {
-    if ((i % 2) == 0) {
-      return true;
-    } else {
-      throw false;
-    }
-  } catch (bool b) {
-    return b;
-  }
-}
-#pragma omp declare target indirect to(is_even)
-
-int main(int, char**) {
-  // We only run the test if a device is detected by OpenMP
-  if (omp_get_num_devices() < 1)
-    return 0;
-
-  const int test_size = 10000;
-  std::vector<int> v(test_size, 2);
-
-  // Providing for_each a function pointer
-  auto idx = std::find_if(std::execution::par_unseq, v.begin(), v.end(), is_even);
-
-  assert(idx == v.begin() && "std::find_if(std::execution::par_unseq,...) does not support exception handling.");
-  return 0;
-}

>From e2f29e9d46376f4f8f43f031d3d35c72571a0ba2 Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2 at gmail.com>
Date: Wed, 27 Mar 2024 14:48:51 -0400
Subject: [PATCH 57/65] Remove checks for omp_get_num_devices() that don't
 belong

---
 .../libcxx/algorithms/alg.pstl.openmp/fill_offload.pass.cpp   | 4 ----
 .../libcxx/algorithms/alg.pstl.openmp/find_if_funptr.pass.cpp | 4 ----
 .../algorithms/alg.pstl.openmp/for_each_funptr.pass.cpp       | 4 ----
 .../algorithms/alg.pstl.openmp/for_each_lambda.pass.cpp       | 4 ----
 4 files changed, 16 deletions(-)

diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/fill_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/fill_offload.pass.cpp
index 905cfd836e945b..834298c2fc2746 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/fill_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/fill_offload.pass.cpp
@@ -23,10 +23,6 @@
 #include <omp.h>
 
 int main(int, char**) {
-  // We only run the test if a device is detected by OpenMP
-  if (omp_get_num_devices() < 1)
-    return 0;
-
   // Initializing test array
   const int test_size = 10000;
   std::vector<int> v(test_size, 2);
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_funptr.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_funptr.pass.cpp
index 69e6ad9ea1de1e..18aefd49e79ac7 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_funptr.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_funptr.pass.cpp
@@ -26,10 +26,6 @@ bool is_odd(int& i) { return (i % 2) == 1; }
 #pragma omp declare target indirect to(is_odd)
 
 int main(int, char**) {
-  // We only run the test if a device is detected by OpenMP
-  if (omp_get_num_devices() < 1)
-    return 0;
-
   const int test_size = 10000;
   std::vector<int> v(test_size, 2.0);
   v[123] = 3;
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_funptr.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_funptr.pass.cpp
index 6769e61d0ef41f..f94ee3069f284a 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_funptr.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_funptr.pass.cpp
@@ -26,10 +26,6 @@ void cube(double& d) { d *= d * d; }
 #pragma omp declare target indirect to(cube)
 
 int main(int, char**) {
-  // We only run the test if a device is detected by OpenMP
-  if (omp_get_num_devices() < 1)
-    return 0;
-
   const int test_size = 10000;
   std::vector<double> v(test_size, 2.0);
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_lambda.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_lambda.pass.cpp
index 7b1b5a88b3f08d..d02e71bde7b8ff 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_lambda.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_lambda.pass.cpp
@@ -34,10 +34,6 @@ void test_lambda(Function fun, Tp initial_value, Tp final_value) {
 }
 
 int main(int, char**) {
-  // We only run the test if a device is detected by OpenMP
-  if (omp_get_num_devices() < 1)
-    return 0;
-
   // Capturing by reference
   auto cube_ref = [&](double& a) { a *= a * a; };
   test_lambda(cube_ref, 2.0, 8.0);

>From 8de5a93860c35423da42b9765194508b7295981b Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2 at gmail.com>
Date: Wed, 27 Mar 2024 14:50:33 -0400
Subject: [PATCH 58/65] [libc++] Remove -L options in tests that shouldn't be
 required (in theory at least)

---
 .../libcxx/algorithms/alg.pstl.openmp/fill_offload.pass.cpp     | 2 +-
 libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if.pass.cpp  | 2 +-
 .../libcxx/algorithms/alg.pstl.openmp/find_if_funptr.pass.cpp   | 2 +-
 .../libcxx/algorithms/alg.pstl.openmp/for_each_funptr.pass.cpp  | 2 +-
 .../libcxx/algorithms/alg.pstl.openmp/for_each_lambda.pass.cpp  | 2 +-
 .../libcxx/algorithms/alg.pstl.openmp/for_each_offload.pass.cpp | 2 +-
 .../alg.pstl.openmp/for_each_overwrite_input.pass.cpp           | 2 +-
 .../alg.pstl.openmp/gpu_environment_variables.pass.cpp          | 2 +-
 .../alg.pstl.openmp/openmp_exception_warning.verify.cpp         | 2 +-
 .../algorithms/alg.pstl.openmp/transform_offload.pass.cpp       | 2 +-
 .../alg.pstl.openmp/transform_reduce_offload.pass.cpp           | 2 +-
 .../transform_reduce_supported_binary_operations.pass.cpp       | 2 +-
 12 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/fill_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/fill_offload.pass.cpp
index 834298c2fc2746..6bcdb9a7ec8e98 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/fill_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/fill_offload.pass.cpp
@@ -12,7 +12,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -fno-exceptions --offload-arch=native -L%{lib}/../../lib -lomptarget -L%{lib}/../../projects/openmp/libomptarget/ -lomptarget.devicertl
+// ADDITIONAL_COMPILE_FLAGS: -fno-exceptions --offload-arch=native
 
 // REQUIRES: openmp_pstl_backend
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if.pass.cpp
index 48851596a4aa90..f1d75479e70fe5 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if.pass.cpp
@@ -12,7 +12,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -fno-exceptions --offload-arch=native -L%{lib}/../../lib -lomptarget -L%{lib}/../../projects/openmp/libomptarget/ -lomptarget.devicertl
+// ADDITIONAL_COMPILE_FLAGS: -fno-exceptions --offload-arch=native
 
 // REQUIRES: openmp_pstl_backend
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_funptr.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_funptr.pass.cpp
index 18aefd49e79ac7..52e75d5d2ca9dc 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_funptr.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_funptr.pass.cpp
@@ -12,7 +12,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -fno-exceptions --offload-arch=native -L%{lib}/../../lib -lomptarget -L%{lib}/../../projects/openmp/libomptarget/ -lomptarget.devicertl
+// ADDITIONAL_COMPILE_FLAGS: -fno-exceptions --offload-arch=native
 
 // REQUIRES: openmp_pstl_backend
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_funptr.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_funptr.pass.cpp
index f94ee3069f284a..a4aa144f0bc1e7 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_funptr.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_funptr.pass.cpp
@@ -12,7 +12,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -fno-exceptions --offload-arch=native -L%{lib}/../../lib -lomptarget -L%{lib}/../../projects/openmp/libomptarget/ -lomptarget.devicertl
+// ADDITIONAL_COMPILE_FLAGS: -fno-exceptions --offload-arch=native
 
 // REQUIRES: openmp_pstl_backend
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_lambda.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_lambda.pass.cpp
index d02e71bde7b8ff..ba99e9106cb251 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_lambda.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_lambda.pass.cpp
@@ -11,7 +11,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -fno-exceptions --offload-arch=native -L%{lib}/../../lib -lomptarget -L%{lib}/../../projects/openmp/libomptarget/ -lomptarget.devicertl
+// ADDITIONAL_COMPILE_FLAGS: -fno-exceptions --offload-arch=native
 
 // REQUIRES: openmp_pstl_backend
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_offload.pass.cpp
index 942637c6e661be..54f4bfc48690a5 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_offload.pass.cpp
@@ -12,7 +12,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -fno-exceptions --offload-arch=native -L%{lib}/../../lib -lomptarget -L%{lib}/../../projects/openmp/libomptarget/ -lomptarget.devicertl
+// ADDITIONAL_COMPILE_FLAGS: -fno-exceptions --offload-arch=native
 
 // REQUIRES: openmp_pstl_backend
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_overwrite_input.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_overwrite_input.pass.cpp
index 8a0743d600a161..bc2fd08312714c 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_overwrite_input.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_overwrite_input.pass.cpp
@@ -12,7 +12,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -fno-exceptions --offload-arch=native -L%{lib}/../../lib -lomptarget -L%{lib}/../../projects/openmp/libomptarget/ -lomptarget.devicertl
+// ADDITIONAL_COMPILE_FLAGS: -fno-exceptions --offload-arch=native
 
 // REQUIRES: openmp_pstl_backend
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/gpu_environment_variables.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/gpu_environment_variables.pass.cpp
index 251c39b98cde98..ea59d5005704ca 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/gpu_environment_variables.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/gpu_environment_variables.pass.cpp
@@ -13,7 +13,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -fno-exceptions --offload-arch=native -L%{lib}/../../lib -lomptarget -L%{lib}/../../projects/openmp/libomptarget/ -lomptarget.devicertl
+// ADDITIONAL_COMPILE_FLAGS: -fno-exceptions --offload-arch=native
 
 // REQUIRES: openmp_pstl_backend
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_exception_warning.verify.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_exception_warning.verify.cpp
index 1b0eaddc999597..0869330a3f7d2b 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_exception_warning.verify.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_exception_warning.verify.cpp
@@ -15,7 +15,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -Wopenmp-target-exception -fexceptions --offload-arch=native --offload-device-only -L%{lib}/../../lib -lomptarget -L%{lib}/../../projects/openmp/libomptarget/ -lomptarget.devicertl
+// ADDITIONAL_COMPILE_FLAGS: -Wopenmp-target-exception -fexceptions --offload-arch=native --offload-device-only
 
 // REQUIRES: openmp_pstl_backend
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_offload.pass.cpp
index 9dd7609bb5b9b2..59d47af8f1fa3e 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_offload.pass.cpp
@@ -12,7 +12,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -fno-exceptions --offload-arch=native -L%{lib}/../../lib -lomptarget -L%{lib}/../../projects/openmp/libomptarget/ -lomptarget.devicertl
+// ADDITIONAL_COMPILE_FLAGS: -fno-exceptions --offload-arch=native
 
 // REQUIRES: openmp_pstl_backend
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_reduce_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_reduce_offload.pass.cpp
index f10efbce19c28c..77fca7268250c6 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_reduce_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_reduce_offload.pass.cpp
@@ -12,7 +12,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -fno-exceptions --offload-arch=native -L%{lib}/../../lib -lomptarget -L%{lib}/../../projects/openmp/libomptarget/ -lomptarget.devicertl
+// ADDITIONAL_COMPILE_FLAGS: -fno-exceptions --offload-arch=native
 
 // REQUIRES: openmp_pstl_backend
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_reduce_supported_binary_operations.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_reduce_supported_binary_operations.pass.cpp
index 594b4a4fcf0bef..11b0555b9f98f3 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_reduce_supported_binary_operations.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_reduce_supported_binary_operations.pass.cpp
@@ -20,7 +20,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -fno-exceptions --offload-arch=native -L%{lib}/../../lib -lomptarget -L%{lib}/../../projects/openmp/libomptarget/ -lomptarget.devicertl
+// ADDITIONAL_COMPILE_FLAGS: -fno-exceptions --offload-arch=native
 
 // REQUIRES: openmp_pstl_backend
 

>From 0f34b3a28c784002b53e49109ff91feb9760f12a Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2 at gmail.com>
Date: Wed, 27 Mar 2024 14:51:00 -0400
Subject: [PATCH 59/65] [libc++] Fix typo in comment

---
 .../transform_reduce_supported_binary_operations.pass.cpp       | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_reduce_supported_binary_operations.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_reduce_supported_binary_operations.pass.cpp
index 11b0555b9f98f3..4c30bbe4c63c06 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_reduce_supported_binary_operations.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_reduce_supported_binary_operations.pass.cpp
@@ -133,7 +133,7 @@ int main(int, char**) {
   }
 
   //===--------------------------------------------------------------------===//
-  // Birwise binary operators
+  // Bitwise binary operators
   //===--------------------------------------------------------------------===//
 
   // Bitwise and

>From 336171a71a9d4a0a672bd40a95e2151ee982bc2c Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2 at gmail.com>
Date: Wed, 27 Mar 2024 14:52:29 -0400
Subject: [PATCH 60/65] fixup! [libc++] Remove -L options in tests that
 shouldn't be required (in theory at least)

---
 .../libcxx/algorithms/alg.pstl.openmp/find_if_offload.pass.cpp  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_offload.pass.cpp
index 538d71dbfa0fa3..3b13a412e4aec1 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_offload.pass.cpp
@@ -12,7 +12,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -fno-exceptions --offload-arch=native -L%{lib}/../../lib -lomptarget  -L%{lib}/../../projects/openmp/libomptarget/ -lomptarget.devicertl
+// ADDITIONAL_COMPILE_FLAGS: -fno-exceptions --offload-arch=native
 
 // REQUIRES: openmp_pstl_backend
 

>From f08c2c6abc0109c3856e303f519fd87a23d4632f Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2 at gmail.com>
Date: Wed, 27 Mar 2024 14:54:00 -0400
Subject: [PATCH 61/65] [libc++] Remove some explicit -fno-exceptions from the
 tests

---
 .../libcxx/algorithms/alg.pstl.openmp/fill_offload.pass.cpp     | 2 +-
 libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if.pass.cpp  | 2 +-
 .../libcxx/algorithms/alg.pstl.openmp/find_if_funptr.pass.cpp   | 2 +-
 .../libcxx/algorithms/alg.pstl.openmp/find_if_offload.pass.cpp  | 2 +-
 .../libcxx/algorithms/alg.pstl.openmp/for_each_funptr.pass.cpp  | 2 +-
 .../libcxx/algorithms/alg.pstl.openmp/for_each_lambda.pass.cpp  | 2 +-
 .../libcxx/algorithms/alg.pstl.openmp/for_each_offload.pass.cpp | 2 +-
 .../alg.pstl.openmp/for_each_overwrite_input.pass.cpp           | 2 +-
 .../alg.pstl.openmp/gpu_environment_variables.pass.cpp          | 2 +-
 .../algorithms/alg.pstl.openmp/transform_offload.pass.cpp       | 2 +-
 .../alg.pstl.openmp/transform_reduce_offload.pass.cpp           | 2 +-
 .../transform_reduce_supported_binary_operations.pass.cpp       | 2 +-
 12 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/fill_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/fill_offload.pass.cpp
index 6bcdb9a7ec8e98..47c4c6bae5f392 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/fill_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/fill_offload.pass.cpp
@@ -12,7 +12,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -fno-exceptions --offload-arch=native
+// ADDITIONAL_COMPILE_FLAGS: --offload-arch=native
 
 // REQUIRES: openmp_pstl_backend
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if.pass.cpp
index f1d75479e70fe5..fbe5efd37aeb7d 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if.pass.cpp
@@ -12,7 +12,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -fno-exceptions --offload-arch=native
+// ADDITIONAL_COMPILE_FLAGS: --offload-arch=native
 
 // REQUIRES: openmp_pstl_backend
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_funptr.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_funptr.pass.cpp
index 52e75d5d2ca9dc..061c063d840ac2 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_funptr.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_funptr.pass.cpp
@@ -12,7 +12,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -fno-exceptions --offload-arch=native
+// ADDITIONAL_COMPILE_FLAGS: --offload-arch=native
 
 // REQUIRES: openmp_pstl_backend
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_offload.pass.cpp
index 3b13a412e4aec1..dbdc06057d0147 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_offload.pass.cpp
@@ -12,7 +12,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -fno-exceptions --offload-arch=native
+// ADDITIONAL_COMPILE_FLAGS: --offload-arch=native
 
 // REQUIRES: openmp_pstl_backend
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_funptr.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_funptr.pass.cpp
index a4aa144f0bc1e7..a37384d646b482 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_funptr.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_funptr.pass.cpp
@@ -12,7 +12,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -fno-exceptions --offload-arch=native
+// ADDITIONAL_COMPILE_FLAGS: --offload-arch=native
 
 // REQUIRES: openmp_pstl_backend
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_lambda.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_lambda.pass.cpp
index ba99e9106cb251..1ee8ac70758b94 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_lambda.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_lambda.pass.cpp
@@ -11,7 +11,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -fno-exceptions --offload-arch=native
+// ADDITIONAL_COMPILE_FLAGS: --offload-arch=native
 
 // REQUIRES: openmp_pstl_backend
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_offload.pass.cpp
index 54f4bfc48690a5..1349dde74ea62e 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_offload.pass.cpp
@@ -12,7 +12,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -fno-exceptions --offload-arch=native
+// ADDITIONAL_COMPILE_FLAGS: --offload-arch=native
 
 // REQUIRES: openmp_pstl_backend
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_overwrite_input.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_overwrite_input.pass.cpp
index bc2fd08312714c..5e0a2aec3ef83d 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_overwrite_input.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_overwrite_input.pass.cpp
@@ -12,7 +12,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -fno-exceptions --offload-arch=native
+// ADDITIONAL_COMPILE_FLAGS: --offload-arch=native
 
 // REQUIRES: openmp_pstl_backend
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/gpu_environment_variables.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/gpu_environment_variables.pass.cpp
index ea59d5005704ca..e04fb3d216459d 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/gpu_environment_variables.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/gpu_environment_variables.pass.cpp
@@ -13,7 +13,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -fno-exceptions --offload-arch=native
+// ADDITIONAL_COMPILE_FLAGS: --offload-arch=native
 
 // REQUIRES: openmp_pstl_backend
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_offload.pass.cpp
index 59d47af8f1fa3e..7043bb63559746 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_offload.pass.cpp
@@ -12,7 +12,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -fno-exceptions --offload-arch=native
+// ADDITIONAL_COMPILE_FLAGS: --offload-arch=native
 
 // REQUIRES: openmp_pstl_backend
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_reduce_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_reduce_offload.pass.cpp
index 77fca7268250c6..2473634bbac04d 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_reduce_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_reduce_offload.pass.cpp
@@ -12,7 +12,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -fno-exceptions --offload-arch=native
+// ADDITIONAL_COMPILE_FLAGS: --offload-arch=native
 
 // REQUIRES: openmp_pstl_backend
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_reduce_supported_binary_operations.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_reduce_supported_binary_operations.pass.cpp
index 4c30bbe4c63c06..120aa774e7391e 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_reduce_supported_binary_operations.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_reduce_supported_binary_operations.pass.cpp
@@ -20,7 +20,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -fno-exceptions --offload-arch=native
+// ADDITIONAL_COMPILE_FLAGS: --offload-arch=native
 
 // REQUIRES: openmp_pstl_backend
 

>From a43631a14f53492bf19834005c47cfc9329e5ac2 Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2 at gmail.com>
Date: Wed, 27 Mar 2024 14:58:46 -0400
Subject: [PATCH 62/65] [libc++] Fix what Lit features we define

---
 libcxx/test/CMakeLists.txt                     |  8 --------
 .../alg.pstl.openmp/fill_offload.pass.cpp      |  2 +-
 .../alg.pstl.openmp/find_if.pass.cpp           |  2 +-
 .../alg.pstl.openmp/find_if_funptr.pass.cpp    |  2 +-
 .../alg.pstl.openmp/find_if_offload.pass.cpp   |  2 +-
 .../alg.pstl.openmp/for_each_funptr.pass.cpp   |  2 +-
 .../alg.pstl.openmp/for_each_lambda.pass.cpp   |  2 +-
 .../alg.pstl.openmp/for_each_offload.pass.cpp  |  2 +-
 .../for_each_overwrite_input.pass.cpp          |  2 +-
 .../gpu_environment_variables.pass.cpp         |  2 +-
 .../openmp_exception_warning.verify.cpp        |  2 +-
 .../openmp_version_40.verify.cpp               |  2 +-
 .../openmp_version_45.verify.cpp               |  2 +-
 .../openmp_version_51.verify.cpp               |  2 +-
 .../alg.pstl.openmp/transform_offload.pass.cpp |  2 +-
 .../transform_reduce_offload.pass.cpp          |  2 +-
 ...reduce_supported_binary_operations.pass.cpp |  2 +-
 libcxx/utils/libcxx/test/features.py           | 18 +++++++++++++++++-
 libcxx/utils/libcxx/test/params.py             | 18 ------------------
 19 files changed, 33 insertions(+), 43 deletions(-)

diff --git a/libcxx/test/CMakeLists.txt b/libcxx/test/CMakeLists.txt
index b0233fc7d1f358..e0d3a0dbc40031 100644
--- a/libcxx/test/CMakeLists.txt
+++ b/libcxx/test/CMakeLists.txt
@@ -30,14 +30,6 @@ if (LLVM_USE_SANITIZER)
   serialize_lit_string_param(SERIALIZED_LIT_PARAMS use_sanitizer "${LLVM_USE_SANITIZER}")
 endif()
 
-# If the OpenMP PSTL backend was enabled, the OpenMP compilation toolchain must
-# also be enabled for the LIT tests
-if (DEFINED LIBCXX_PSTL_BACKEND)
-  if (LIBCXX_PSTL_BACKEND STREQUAL "openmp")
-    serialize_lit_string_param(SERIALIZED_LIT_PARAMS openmp_pstl_backend "ON")
-  endif()
-endif()
-
 serialize_lit_params_list(SERIALIZED_LIT_PARAMS LIBCXX_TEST_PARAMS)
 
 if (NOT DEFINED LIBCXX_TEST_DEPS)
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/fill_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/fill_offload.pass.cpp
index 47c4c6bae5f392..8e82129cfbd449 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/fill_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/fill_offload.pass.cpp
@@ -14,7 +14,7 @@
 
 // ADDITIONAL_COMPILE_FLAGS: --offload-arch=native
 
-// REQUIRES: openmp_pstl_backend
+// REQUIRES: libcpp-pstl-backend-openmp
 
 #include <algorithm>
 #include <cassert>
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if.pass.cpp
index fbe5efd37aeb7d..0d0dad3a28a216 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if.pass.cpp
@@ -14,7 +14,7 @@
 
 // ADDITIONAL_COMPILE_FLAGS: --offload-arch=native
 
-// REQUIRES: openmp_pstl_backend
+// REQUIRES: libcpp-pstl-backend-openmp
 
 #include <algorithm>
 #include <array>
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_funptr.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_funptr.pass.cpp
index 061c063d840ac2..b5909ffcd001a3 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_funptr.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_funptr.pass.cpp
@@ -14,7 +14,7 @@
 
 // ADDITIONAL_COMPILE_FLAGS: --offload-arch=native
 
-// REQUIRES: openmp_pstl_backend
+// REQUIRES: libcpp-pstl-backend-openmp
 
 #include <algorithm>
 #include <cassert>
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_offload.pass.cpp
index dbdc06057d0147..2c74b6099afe6f 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/find_if_offload.pass.cpp
@@ -14,7 +14,7 @@
 
 // ADDITIONAL_COMPILE_FLAGS: --offload-arch=native
 
-// REQUIRES: openmp_pstl_backend
+// REQUIRES: libcpp-pstl-backend-openmp
 
 #include <algorithm>
 #include <cassert>
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_funptr.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_funptr.pass.cpp
index a37384d646b482..fd15120fd22450 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_funptr.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_funptr.pass.cpp
@@ -14,7 +14,7 @@
 
 // ADDITIONAL_COMPILE_FLAGS: --offload-arch=native
 
-// REQUIRES: openmp_pstl_backend
+// REQUIRES: libcpp-pstl-backend-openmp
 
 #include <algorithm>
 #include <cassert>
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_lambda.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_lambda.pass.cpp
index 1ee8ac70758b94..c44942b071614a 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_lambda.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_lambda.pass.cpp
@@ -13,7 +13,7 @@
 
 // ADDITIONAL_COMPILE_FLAGS: --offload-arch=native
 
-// REQUIRES: openmp_pstl_backend
+// REQUIRES: libcpp-pstl-backend-openmp
 
 #include <algorithm>
 #include <cassert>
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_offload.pass.cpp
index 1349dde74ea62e..19e4f61bb87e17 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_offload.pass.cpp
@@ -14,7 +14,7 @@
 
 // ADDITIONAL_COMPILE_FLAGS: --offload-arch=native
 
-// REQUIRES: openmp_pstl_backend
+// REQUIRES: libcpp-pstl-backend-openmp
 
 #include <algorithm>
 #include <cassert>
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_overwrite_input.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_overwrite_input.pass.cpp
index 5e0a2aec3ef83d..29cb57e99b7a46 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_overwrite_input.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/for_each_overwrite_input.pass.cpp
@@ -14,7 +14,7 @@
 
 // ADDITIONAL_COMPILE_FLAGS: --offload-arch=native
 
-// REQUIRES: openmp_pstl_backend
+// REQUIRES: libcpp-pstl-backend-openmp
 
 #include <algorithm>
 #include <array>
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/gpu_environment_variables.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/gpu_environment_variables.pass.cpp
index e04fb3d216459d..f64f876ac9e905 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/gpu_environment_variables.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/gpu_environment_variables.pass.cpp
@@ -15,7 +15,7 @@
 
 // ADDITIONAL_COMPILE_FLAGS: --offload-arch=native
 
-// REQUIRES: openmp_pstl_backend
+// REQUIRES: libcpp-pstl-backend-openmp
 
 #include <string>
 #include <cassert>
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_exception_warning.verify.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_exception_warning.verify.cpp
index 0869330a3f7d2b..a5cbeaf22e8d01 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_exception_warning.verify.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_exception_warning.verify.cpp
@@ -17,7 +17,7 @@
 
 // ADDITIONAL_COMPILE_FLAGS: -Wopenmp-target-exception -fexceptions --offload-arch=native --offload-device-only
 
-// REQUIRES: openmp_pstl_backend
+// REQUIRES: libcpp-pstl-backend-openmp
 
 #include <algorithm>
 #include <execution>
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_version_40.verify.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_version_40.verify.cpp
index 77836d47b08120..85b2656788cf35 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_version_40.verify.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_version_40.verify.cpp
@@ -14,7 +14,7 @@
 
 // ADDITIONAL_COMPILE_FLAGS: -fopenmp -fopenmp-version=40
 
-// REQUIRES: openmp_pstl_backend
+// REQUIRES: libcpp-pstl-backend-openmp
 
 #include <algorithm>
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_version_45.verify.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_version_45.verify.cpp
index ff0bf1cf67c8e8..4765ca0d540b34 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_version_45.verify.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_version_45.verify.cpp
@@ -14,7 +14,7 @@
 
 // ADDITIONAL_COMPILE_FLAGS: -fopenmp -fopenmp-version=45
 
-// REQUIRES: openmp_pstl_backend
+// REQUIRES: libcpp-pstl-backend-openmp
 
 #include <algorithm>
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_version_51.verify.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_version_51.verify.cpp
index 401285586caee2..b7836cb9425486 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_version_51.verify.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/openmp_version_51.verify.cpp
@@ -14,7 +14,7 @@
 
 // ADDITIONAL_COMPILE_FLAGS: -fopenmp -fopenmp-version=51
 
-// REQUIRES: openmp_pstl_backend
+// REQUIRES: libcpp-pstl-backend-openmp
 
 #include <algorithm>
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_offload.pass.cpp
index 7043bb63559746..c664aa4f35394f 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_offload.pass.cpp
@@ -14,7 +14,7 @@
 
 // ADDITIONAL_COMPILE_FLAGS: --offload-arch=native
 
-// REQUIRES: openmp_pstl_backend
+// REQUIRES: libcpp-pstl-backend-openmp
 
 #include <algorithm>
 #include <cassert>
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_reduce_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_reduce_offload.pass.cpp
index 2473634bbac04d..035f0adc23a51b 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_reduce_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_reduce_offload.pass.cpp
@@ -14,7 +14,7 @@
 
 // ADDITIONAL_COMPILE_FLAGS: --offload-arch=native
 
-// REQUIRES: openmp_pstl_backend
+// REQUIRES: libcpp-pstl-backend-openmp
 
 #include <algorithm>
 #include <cassert>
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_reduce_supported_binary_operations.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_reduce_supported_binary_operations.pass.cpp
index 120aa774e7391e..b8d02b471e90ab 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_reduce_supported_binary_operations.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.openmp/transform_reduce_supported_binary_operations.pass.cpp
@@ -22,7 +22,7 @@
 
 // ADDITIONAL_COMPILE_FLAGS: --offload-arch=native
 
-// REQUIRES: openmp_pstl_backend
+// REQUIRES: libcpp-pstl-backend-openmp
 
 #include <algorithm>
 #include <cassert>
diff --git a/libcxx/utils/libcxx/test/features.py b/libcxx/utils/libcxx/test/features.py
index 9f0ad063d7fb77..d92b8c26a082a2 100644
--- a/libcxx/utils/libcxx/test/features.py
+++ b/libcxx/utils/libcxx/test/features.py
@@ -313,7 +313,6 @@ def _getAndroidDeviceApi(cfg):
     "_LIBCPP_HAS_NO_TIME_ZONE_DATABASE": "no-tzdb",
     "_LIBCPP_HAS_NO_UNICODE": "libcpp-has-no-unicode",
     "_LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH": "libcpp-pstl-cpu-backend-libdispatch",
-    "_LIBCPP_PSTL_BACKEND_OPENMP": "openmp_pstl_backend",
 }
 for macro, feature in macros.items():
     DEFAULT_FEATURES.append(
@@ -323,6 +322,23 @@ def _getAndroidDeviceApi(cfg):
         )
     )
 
+DEFAULT_FEATURES.append(
+    Feature(
+        name="libcpp-pstl-backend-openmp",
+        when=lambda cfg: '_LIBCPP_PSTL_BACKEND_OPENMP' in compilerMacros(cfg),
+        actions=[
+            AddCompileFlag("-fopenmp"),
+            # The linker needs to find the correct version of libomptarget
+            AddLinkFlag("-Wl,-rpath,%{lib}/../../lib"),
+            # The preprocessor needs to find the omp.h header. If OpenMP was
+            # installed as a project, the header lives in the following
+            # directory
+            AddFlag("-I %{lib}/../../projects/openmp/runtime/src/"),
+            # And if it was installed as a runtime it lives in the following:
+            AddFlag("-I %{lib}/../../runtimes/runtimes-bins/openmp/runtime/src"),
+        ]
+    )
+)
 
 # Mapping from canonical locale names (used in the tests) to possible locale
 # names on various systems. Each locale is considered supported if any of the
diff --git a/libcxx/utils/libcxx/test/params.py b/libcxx/utils/libcxx/test/params.py
index 4bd4c6f11dc341..5e42562ed5db52 100644
--- a/libcxx/utils/libcxx/test/params.py
+++ b/libcxx/utils/libcxx/test/params.py
@@ -408,23 +408,5 @@ def getSuitableClangTidy(cfg):
             AddSubstitution('%{clang-tidy}', exe),
         ]
     ),
-    Parameter(
-        name="openmp_pstl_backend",
-        choices=[True, False],
-        type=bool,
-        default=False,
-        help="Enable the OpenMP compilation toolchain if the PSTL backend was set to OpenMP.",
-        actions=lambda enabled: [
-            AddCompileFlag("-fopenmp"),
-            # The linker needs to find the correct version of libomptarget
-            AddLinkFlag("-Wl,-rpath,%{lib}/../../lib"),
-            # The preprocessor needs to find the omp.h header. If OpenMP was 
-            # installed as a project, the header lives in the following
-            # directory
-            AddFlag("-I %{lib}/../../projects/openmp/runtime/src/"),
-            # And if it was installed as a runtime it lives in the following:
-            AddFlag("-I %{lib}/../../runtimes/runtimes-bins/openmp/runtime/src"),
-        ] if enabled else [],
-    ),
 ]
 # fmt: on

>From 7e111a99e262ff667294d7cf62b31e1f7985baf7 Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2 at gmail.com>
Date: Wed, 27 Mar 2024 15:02:29 -0400
Subject: [PATCH 63/65] [libc++] Rename implementation-detail macros to
 _LIBCPP_FOO

---
 .../pstl_backends/openmp/transform_reduce.h   | 48 +++++++++----------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
index 4f0b7a80a4e12a..437ea2771874a4 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
@@ -39,7 +39,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 // Templates for predefined reductions
 //===----------------------------------------------------------------------===//
 
-#  define __PSTL_OMP_SIMD_1_REDUCTION(omp_op, std_op)                                                                  \
+#  define _LIBCPP_PSTL_OMP_SIMD_1_REDUCTION(omp_op, std_op)                                                            \
     template <class _Iterator,                                                                                         \
               class _DifferenceType,                                                                                   \
               typename _Tp,                                                                                            \
@@ -59,7 +59,7 @@ _PSTL_PRAGMA(omp target teams distribute parallel for reduction(omp_op:__init))
       return __init;                                                                                                   \
     }
 
-#  define __PSTL_OMP_SIMD_2_REDUCTION(omp_op, std_op)                                                                  \
+#  define _LIBCPP_PSTL_OMP_SIMD_2_REDUCTION(omp_op, std_op)                                                            \
     template <class _Iterator1,                                                                                        \
               class _Iterator2,                                                                                        \
               class _DifferenceType,                                                                                   \
@@ -83,33 +83,33 @@ _PSTL_PRAGMA(omp target teams distribute parallel for reduction(omp_op:__init))
       return __init;                                                                                                   \
     }
 
-#  define __PSTL_OMP_SIMD_REDUCTION(omp_op, std_op)                                                                    \
-    __PSTL_OMP_SIMD_1_REDUCTION(omp_op, std_op)                                                                        \
-    __PSTL_OMP_SIMD_2_REDUCTION(omp_op, std_op)
+#  define _LIBCPP_PSTL_OMP_SIMD_REDUCTION(omp_op, std_op)                                                              \
+    _LIBCPP_PSTL_OMP_SIMD_1_REDUCTION(omp_op, std_op)                                                                  \
+    _LIBCPP_PSTL_OMP_SIMD_2_REDUCTION(omp_op, std_op)
 
 // Addition
-__PSTL_OMP_SIMD_REDUCTION(+, std::plus)
+_LIBCPP_PSTL_OMP_SIMD_REDUCTION(+, std::plus)
 
 // Subtraction
-__PSTL_OMP_SIMD_REDUCTION(-, std::minus)
+_LIBCPP_PSTL_OMP_SIMD_REDUCTION(-, std::minus)
 
 // Multiplication
-__PSTL_OMP_SIMD_REDUCTION(*, std::multiplies)
+_LIBCPP_PSTL_OMP_SIMD_REDUCTION(*, std::multiplies)
 
 // Logical and
-__PSTL_OMP_SIMD_REDUCTION(&&, std::logical_and)
+_LIBCPP_PSTL_OMP_SIMD_REDUCTION(&&, std::logical_and)
 
 // Logical or
-__PSTL_OMP_SIMD_REDUCTION(||, std::logical_or)
+_LIBCPP_PSTL_OMP_SIMD_REDUCTION(||, std::logical_or)
 
 // Bitwise and
-__PSTL_OMP_SIMD_REDUCTION(&, std::bit_and)
+_LIBCPP_PSTL_OMP_SIMD_REDUCTION(&, std::bit_and)
 
 // Bitwise or
-__PSTL_OMP_SIMD_REDUCTION(|, std::bit_or)
+_LIBCPP_PSTL_OMP_SIMD_REDUCTION(|, std::bit_or)
 
 // Bitwise xor
-__PSTL_OMP_SIMD_REDUCTION(^, std::bit_xor)
+_LIBCPP_PSTL_OMP_SIMD_REDUCTION(^, std::bit_xor)
 
 //===----------------------------------------------------------------------===//
 // The following struct is used to determine whether a reduction is supported by
@@ -119,21 +119,21 @@ __PSTL_OMP_SIMD_REDUCTION(^, std::bit_xor)
 template <class _T1, class _T2, class _T3>
 struct __is_supported_reduction : std::false_type {};
 
-#  define __PSTL_IS_SUPPORTED_REDUCTION(funname)                                                                       \
+#  define _LIBCPP_PSTL_IS_SUPPORTED_REDUCTION(func)                                                                    \
     template <class _Tp>                                                                                               \
-    struct __is_supported_reduction<std::funname<_Tp>, _Tp, _Tp> : std::true_type {};                                  \
+    struct __is_supported_reduction<func<_Tp>, _Tp, _Tp> : std::true_type {};                                          \
     template <class _Tp, class _Up>                                                                                    \
-    struct __is_supported_reduction<std::funname<>, _Tp, _Up> : std::true_type {};
+    struct __is_supported_reduction<func<>, _Tp, _Up> : std::true_type {};
 
 // __is_trivial_plus_operation already exists
-__PSTL_IS_SUPPORTED_REDUCTION(plus)
-__PSTL_IS_SUPPORTED_REDUCTION(minus)
-__PSTL_IS_SUPPORTED_REDUCTION(multiplies)
-__PSTL_IS_SUPPORTED_REDUCTION(logical_and)
-__PSTL_IS_SUPPORTED_REDUCTION(logical_or)
-__PSTL_IS_SUPPORTED_REDUCTION(bit_and)
-__PSTL_IS_SUPPORTED_REDUCTION(bit_or)
-__PSTL_IS_SUPPORTED_REDUCTION(bit_xor)
+_LIBCPP_PSTL_IS_SUPPORTED_REDUCTION(std::plus)
+_LIBCPP_PSTL_IS_SUPPORTED_REDUCTION(std::minus)
+_LIBCPP_PSTL_IS_SUPPORTED_REDUCTION(std::multiplies)
+_LIBCPP_PSTL_IS_SUPPORTED_REDUCTION(std::logical_and)
+_LIBCPP_PSTL_IS_SUPPORTED_REDUCTION(std::logical_or)
+_LIBCPP_PSTL_IS_SUPPORTED_REDUCTION(std::bit_and)
+_LIBCPP_PSTL_IS_SUPPORTED_REDUCTION(std::bit_or)
+_LIBCPP_PSTL_IS_SUPPORTED_REDUCTION(std::bit_xor)
 
 //===----------------------------------------------------------------------===//
 // Implementation of PSTL transform_reduce for one and two input iterators

>From dccac49adb108ce6a9a72f463185f31c40a74373 Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2 at gmail.com>
Date: Wed, 27 Mar 2024 15:03:07 -0400
Subject: [PATCH 64/65] [libc++] Add comment for any_of that shouldn't be needed

---
 libcxx/include/__algorithm/pstl_backends/openmp/any_of.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/any_of.h b/libcxx/include/__algorithm/pstl_backends/openmp/any_of.h
index ec5b4c4a4c3aa2..c950ea0cac9058 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/any_of.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/any_of.h
@@ -19,10 +19,10 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
+// TODO: Remove this function once https://github.com/llvm/llvm-project/issues/70718 is fixed.
 template <class _ExecutionPolicy, class _ForwardIterator, class _Predicate>
 _LIBCPP_HIDE_FROM_ABI optional<bool>
 __pstl_any_of(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) {
-  // TODO: Implement GPU backend
   return std::__pstl_any_of<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __pred);
 }
 

>From 692c55262701081fb65f9e7c089f7b1ba1ade8ee Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2 at gmail.com>
Date: Wed, 27 Mar 2024 15:09:07 -0400
Subject: [PATCH 65/65] [libc++] Add a CI job for the OpenMP PSTL backend

---
 .github/workflows/libcxx-build-and-test.yaml  | 1 +
 libcxx/cmake/caches/Generic-pstl-openmp.cmake | 1 +
 libcxx/utils/ci/run-buildbot                  | 5 +++++
 3 files changed, 7 insertions(+)
 create mode 100644 libcxx/cmake/caches/Generic-pstl-openmp.cmake

diff --git a/.github/workflows/libcxx-build-and-test.yaml b/.github/workflows/libcxx-build-and-test.yaml
index 4a881ef5ff56af..029015c9d0bce3 100644
--- a/.github/workflows/libcxx-build-and-test.yaml
+++ b/.github/workflows/libcxx-build-and-test.yaml
@@ -161,6 +161,7 @@ jobs:
           'generic-no-wide-characters',
           'generic-no-rtti',
           'generic-optimized-speed',
+          'generic-pstl-openmp',
           'generic-static',
           # TODO Find a better place for the benchmark and bootstrapping builds to live. They're either very expensive
           # or don't provide much value since the benchmark run results are too noise on the bots.
diff --git a/libcxx/cmake/caches/Generic-pstl-openmp.cmake b/libcxx/cmake/caches/Generic-pstl-openmp.cmake
new file mode 100644
index 00000000000000..f3ff4f3b57fd21
--- /dev/null
+++ b/libcxx/cmake/caches/Generic-pstl-openmp.cmake
@@ -0,0 +1 @@
+set(LIBCXX_PSTL_BACKEND openmp CACHE STRING "")
diff --git a/libcxx/utils/ci/run-buildbot b/libcxx/utils/ci/run-buildbot
index 2905745355b68e..e8a10409059d2e 100755
--- a/libcxx/utils/ci/run-buildbot
+++ b/libcxx/utils/ci/run-buildbot
@@ -512,6 +512,11 @@ generic-optimized-speed)
     generate-cmake -C "${MONOREPO_ROOT}/libcxx/cmake/caches/Generic-optimized-speed.cmake"
     check-runtimes
 ;;
+generic-pstl-openmp)
+    clean
+    generate-cmake -C "${MONOREPO_ROOT}/libcxx/cmake/caches/Generic-pstl-openmp.cmake"
+    check-runtimes
+;;
 apple-system)
     clean