[compiler-rt] Adding Separate OpenMP Offloading Backend to `libcxx/include/__algorithm/pstl_backends` (PR #66968)

Fri Oct 20 13:18:24 PDT 2023

https://github.com/AntonRydahl updated https://github.com/llvm/llvm-project/pull/66968

>From b35340e47de896c9933c54ce617538c46cf01488 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Wed, 20 Sep 2023 17:06:10 -0700
Subject: [PATCH 01/24] Adding OpenMP Offloading Backend for C++ Parallel
 Algorithms

---
 libcxx/CMakeLists.txt                         | 14 +++
 libcxx/include/CMakeLists.txt                 |  5 +
 libcxx/include/__algorithm/pstl_backend.h     |  8 ++
 .../__algorithm/pstl_backends/gpu_backend.h   | 21 +++++
 .../pstl_backends/gpu_backends/backend.h      | 33 +++++++
 .../pstl_backends/gpu_backends/fill.h         | 59 ++++++++++++
 .../pstl_backends/gpu_backends/for_each.h     | 59 ++++++++++++
 .../pstl_backends/gpu_backends/omp_offload.h  | 91 +++++++++++++++++++
 libcxx/include/__config_site.in               |  1 +
 9 files changed, 291 insertions(+)
 create mode 100644 libcxx/include/__algorithm/pstl_backends/gpu_backend.h
 create mode 100644 libcxx/include/__algorithm/pstl_backends/gpu_backends/backend.h
 create mode 100644 libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
 create mode 100644 libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
 create mode 100644 libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h

diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt
index bb2898b799bcef9..43d2a448de79584 100644
--- a/libcxx/CMakeLists.txt
+++ b/libcxx/CMakeLists.txt
@@ -290,6 +290,8 @@ option(LIBCXX_HAS_WIN32_THREAD_API "Ignore auto-detection and force use of win32
 option(LIBCXX_HAS_EXTERNAL_THREAD_API
   "Build libc++ with an externalized threading API.
    This option may only be set to ON when LIBCXX_ENABLE_THREADS=ON." OFF)
+option(LIBCXX_ENABLE_GPU_OFFLOAD 
+  "Build libc++ with support for GPU offload" OFF)
 
 if (LIBCXX_ENABLE_THREADS)
   set(LIBCXX_PSTL_CPU_BACKEND "std_thread" CACHE STRING "Which PSTL CPU backend to use")
@@ -297,6 +299,14 @@ else()
   set(LIBCXX_PSTL_CPU_BACKEND "serial" CACHE STRING "Which PSTL CPU backend to use")
 endif()
 
+if (NOT DEFINED LIBCXX_PSTL_GPU_BACKEND)
+  if (${LIBCXX_ENABLE_GPU_OFFLOAD})
+    set(LIBCXX_PSTL_GPU_BACKEND "omp_offload" CACHE STRING "Which PSTL GPU backend to use")
+  else()
+    set(LIBCXX_PSTL_GPU_BACKEND "none" CACHE STRING "Which PSTL GPU backend to use")
+  endif()
+endif()
+
 # Misc options ----------------------------------------------------------------
 # FIXME: Turn -pedantic back ON. It is currently off because it warns
 # about #include_next which is used everywhere.
@@ -809,6 +819,10 @@ else()
                        Valid backends are: serial, std_thread and libdispatch")
 endif()
 
+if (LIBCXX_PSTL_GPU_BACKEND STREQUAL "omp_offload")
+  config_define(1 _LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
+endif()
+
 if (LIBCXX_ABI_DEFINES)
   set(abi_defines)
   foreach (abi_define ${LIBCXX_ABI_DEFINES})
diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index 2ec755236dbaee2..a3d72df61a86dde 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -85,6 +85,11 @@ set(files
   __algorithm/pstl_backends/cpu_backends/thread.h
   __algorithm/pstl_backends/cpu_backends/transform.h
   __algorithm/pstl_backends/cpu_backends/transform_reduce.h
+  __algorithm/pstl_backends/gpu_backend.h
+  __algorithm/pstl_backends/gpu_backends/backend.h
+  __algorithm/pstl_backends/gpu_backends/fill.h
+  __algorithm/pstl_backends/gpu_backends/for_each.h
+  __algorithm/pstl_backends/gpu_backends/omp_offload.h
   __algorithm/pstl_copy.h
   __algorithm/pstl_count.h
   __algorithm/pstl_fill.h
diff --git a/libcxx/include/__algorithm/pstl_backend.h b/libcxx/include/__algorithm/pstl_backend.h
index 93372f019031b63..f051e0ce9be13c3 100644
--- a/libcxx/include/__algorithm/pstl_backend.h
+++ b/libcxx/include/__algorithm/pstl_backend.h
@@ -10,6 +10,7 @@
 #define _LIBCPP___ALGORITHM_PSTL_BACKEND_H
 
 #include <__algorithm/pstl_backends/cpu_backend.h>
+#include <__algorithm/pstl_backends/gpu_backend.h>
 #include <__config>
 #include <execution>
 
@@ -179,10 +180,17 @@ struct __select_backend<std::execution::parallel_policy> {
   using type = __cpu_backend_tag;
 };
 
+#   if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
+template <>
+struct __select_backend<std::execution::parallel_unsequenced_policy> {
+  using type = __gpu_backend_tag;
+};
+#   else
 template <>
 struct __select_backend<std::execution::parallel_unsequenced_policy> {
   using type = __cpu_backend_tag;
 };
+#   endif
 
 #  else
 
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
new file mode 100644
index 000000000000000..46a85f77b5deb99
--- /dev/null
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
@@ -0,0 +1,21 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
+
+#include <__config>
+
+#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+
+#if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
+#include <__algorithm/pstl_backends/gpu_backends/fill.h>
+#include <__algorithm/pstl_backends/gpu_backends/for_each.h>
+#endif
+
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/backend.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/backend.h
new file mode 100644
index 000000000000000..a8b400afbb94d9d
--- /dev/null
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/backend.h
@@ -0,0 +1,33 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_BACKEND_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_BACKEND_H
+
+#include <__config>
+#include <cstddef>
+
+#if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
+#  include <__algorithm/pstl_backends/gpu_backends/omp_offload.h>
+#endif
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+#if _LIBCPP_STD_VER >= 17
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+struct __gpu_backend_tag {};
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // _LIBCPP_STD_VER >= 17
+
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_BACKEND_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
new file mode 100644
index 000000000000000..5603e18a5d2d3fc
--- /dev/null
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
@@ -0,0 +1,59 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FILL_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FILL_H
+
+#include <__algorithm/fill.h>
+#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
+#include <__config>
+#include <__iterator/concepts.h>
+#include <__type_traits/is_execution_policy.h>
+#include <__utility/terminate_on_exception.h>
+#include <stdio.h>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+template <class _ExecutionPolicy, class _ForwardIterator, class _Tp>
+_LIBCPP_HIDE_FROM_ABI void
+__pstl_fill(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) {
+  // It is only safe to execute for_each on the GPU, it the execution policy is 
+  // parallel unsequenced, as it is the only execution policy prohibiting throwing
+  // exceptions and allowing SIMD instructions
+  if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
+                       __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+    std::__par_backend::__parallel_for_simd_val_1(__first, __last - __first, __value);
+  }
+  // Else if the excution policy is parallel, we execute for_each on the CPU instead
+  else  if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
+                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+    std::__terminate_on_exception([&] {
+      __par_backend::__parallel_for(
+          __first, __last, [&__value](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
+            std::__pstl_fill<__remove_parallel_policy_t<_ExecutionPolicy>>(
+                __cpu_backend_tag{}, __brick_first, __brick_last, __value);
+          });
+    });
+  // Else we execute for_each in serial
+  } else {
+    std::fill(__first, __last, __value);
+  }
+}
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FILL_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
new file mode 100644
index 000000000000000..20486d83863f420
--- /dev/null
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
@@ -0,0 +1,59 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FOR_EACH_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FOR_EACH_H
+
+#include <__algorithm/for_each.h>
+#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
+#include <__config>
+#include <__iterator/concepts.h>
+#include <__type_traits/is_execution_policy.h>
+#include <__utility/terminate_on_exception.h>
+#include <stdio.h>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+template <class _ExecutionPolicy, class _ForwardIterator, class _Functor>
+_LIBCPP_HIDE_FROM_ABI void
+__pstl_for_each(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Functor __func) {
+  // It is only safe to execute for_each on the GPU, it the execution policy is 
+  // parallel unsequenced, as it is the only execution policy prohibiting throwing
+  // exceptions and allowing SIMD instructions
+  if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
+                       __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+    std::__par_backend::__parallel_for_simd_1(__first, __last - __first, __func);
+  }
+  // Else if the excution policy is parallel, we execute for_each on the CPU instead
+  else  if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
+                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+    std::__terminate_on_exception([&] {
+      std::__par_backend::__parallel_for(
+          __first, __last, [__func](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
+            std::__pstl_for_each<__remove_parallel_policy_t<_ExecutionPolicy>>(
+                __cpu_backend_tag{}, __brick_first, __brick_last, __func);
+          });
+    });
+  // Else we execute for_each in serial
+  } else {
+    std::for_each(__first, __last, __func);
+  }
+}
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FOR_EACH_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
new file mode 100644
index 000000000000000..840118dbec5057c
--- /dev/null
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
@@ -0,0 +1,91 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_OMP_OFFLOAD_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_OMP_OFFLOAD_H
+
+#include <__assert>
+#include <__config>
+#include <__utility/move.h>
+#include <cstddef>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+_LIBCPP_PUSH_MACROS
+#include <__undef_macros>
+
+#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+namespace __par_backend {
+inline namespace __omp_gpu_backend {
+
+// In OpenMP, we need to extract the pointer for the underlying data for data
+// structures like std::vector and std::array to be able to map the data to the
+// device.
+
+template <typename T>
+_LIBCPP_HIDE_FROM_ABI inline T __omp_extract_base_ptr(T p) {
+  return p;
+}
+
+template <typename T>
+_LIBCPP_HIDE_FROM_ABI inline T __omp_extract_base_ptr(std::__wrap_iter<T> w) {
+  std::pointer_traits<std::__wrap_iter<T>> PT;
+  return PT.to_address(w);
+}
+
+// Applying function or lambda in a loop
+
+template <class _Iterator, class _DifferenceType, class _Function>
+_LIBCPP_HIDE_FROM_ABI _Iterator __omp_parallel_for_simd_1(_Iterator __first, _DifferenceType __n, _Function __f) noexcept {
+  #pragma omp target teams distribute parallel for simd map(tofrom:__first[0:__n])
+  for (_DifferenceType __i = 0; __i < __n; ++__i)
+    __f(__first[__i]);
+
+  return __first + __n;
+}
+
+// Extracting the underlying pointer
+
+template <class _Iterator, class _DifferenceType, class _Function>
+_LIBCPP_HIDE_FROM_ABI _Iterator __parallel_for_simd_1(_Iterator __first, _DifferenceType __n, _Function __f) noexcept {
+  __omp_parallel_for_simd_1(__omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __f);
+  return __first + __n;
+}
+
+// Assigning a value in a loop
+
+template <class _Index, class _DifferenceType, class _Tp>
+_LIBCPP_HIDE_FROM_ABI _Index __omp_parallel_for_simd_val_1(_Index __first, _DifferenceType __n, const _Tp& __value) noexcept {
+  #pragma omp target teams distribute parallel for simd map(tofrom:__first[0:__n]) map(to:__value)
+  for (_DifferenceType __i = 0; __i < __n; ++__i)
+    __first[__i] = __value;
+
+  return __first + __n;
+}
+
+template <class _Index, class _DifferenceType, class _Tp>
+_LIBCPP_HIDE_FROM_ABI _Index __parallel_for_simd_val_1(_Index __first, _DifferenceType __n, const _Tp& __value) noexcept {
+  __omp_parallel_for_simd_val_1(__omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __value);
+  return __first + __n;
+}
+
+} // namespace __omp_gpu_backend
+} // namespace __par_backend
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && && _LIBCPP_STD_VER >= 17
+
+_LIBCPP_POP_MACROS
+
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_OMP_OFFLOAD_H
diff --git a/libcxx/include/__config_site.in b/libcxx/include/__config_site.in
index c85cbcd02c441b9..e0edddce3afc3ff 100644
--- a/libcxx/include/__config_site.in
+++ b/libcxx/include/__config_site.in
@@ -34,6 +34,7 @@
 #cmakedefine _LIBCPP_PSTL_CPU_BACKEND_SERIAL
 #cmakedefine _LIBCPP_PSTL_CPU_BACKEND_THREAD
 #cmakedefine _LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH
+#cmakedefine _LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD
 
 // Hardening.
 #cmakedefine01 _LIBCPP_ENABLE_HARDENED_MODE_DEFAULT

>From af5ddf7709e44435c3b0b15421aa9cfc24b49e84 Mon Sep 17 00:00:00 2001
From: antonrydahl <rydahl2610 at gmail.com>
Date: Wed, 20 Sep 2023 17:48:25 -0700
Subject: [PATCH 02/24] Clang formatting OpenMP backend for parallel algorithms

---
 libcxx/include/__algorithm/pstl_backend.h           |  6 +++---
 .../include/__algorithm/pstl_backends/gpu_backend.h |  4 ++--
 .../__algorithm/pstl_backends/gpu_backends/fill.h   | 12 ++++++------
 .../pstl_backends/gpu_backends/for_each.h           | 12 ++++++------
 .../pstl_backends/gpu_backends/omp_offload.h        | 13 ++++++++-----
 5 files changed, 25 insertions(+), 22 deletions(-)

diff --git a/libcxx/include/__algorithm/pstl_backend.h b/libcxx/include/__algorithm/pstl_backend.h
index f051e0ce9be13c3..47f5191b48517ba 100644
--- a/libcxx/include/__algorithm/pstl_backend.h
+++ b/libcxx/include/__algorithm/pstl_backend.h
@@ -180,17 +180,17 @@ struct __select_backend<std::execution::parallel_policy> {
   using type = __cpu_backend_tag;
 };
 
-#   if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
+#    if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
 template <>
 struct __select_backend<std::execution::parallel_unsequenced_policy> {
   using type = __gpu_backend_tag;
 };
-#   else
+#    else
 template <>
 struct __select_backend<std::execution::parallel_unsequenced_policy> {
   using type = __cpu_backend_tag;
 };
-#   endif
+#    endif
 
 #  else
 
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
index 46a85f77b5deb99..7237036156a1bf3 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
@@ -14,8 +14,8 @@
 #include <__algorithm/pstl_backends/gpu_backends/backend.h>
 
 #if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
-#include <__algorithm/pstl_backends/gpu_backends/fill.h>
-#include <__algorithm/pstl_backends/gpu_backends/for_each.h>
+#  include <__algorithm/pstl_backends/gpu_backends/fill.h>
+#  include <__algorithm/pstl_backends/gpu_backends/for_each.h>
 #endif
 
 #endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
index 5603e18a5d2d3fc..32926da87e2a083 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
@@ -10,8 +10,8 @@
 #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FILL_H
 
 #include <__algorithm/fill.h>
-#include <__algorithm/pstl_backends/gpu_backends/backend.h>
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
+#include <__algorithm/pstl_backends/gpu_backends/backend.h>
 #include <__config>
 #include <__iterator/concepts.h>
 #include <__type_traits/is_execution_policy.h>
@@ -29,16 +29,16 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 template <class _ExecutionPolicy, class _ForwardIterator, class _Tp>
 _LIBCPP_HIDE_FROM_ABI void
 __pstl_fill(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) {
-  // It is only safe to execute for_each on the GPU, it the execution policy is 
+  // It is only safe to execute for_each on the GPU, it the execution policy is
   // parallel unsequenced, as it is the only execution policy prohibiting throwing
   // exceptions and allowing SIMD instructions
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
-                       __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
     std::__par_backend::__parallel_for_simd_val_1(__first, __last - __first, __value);
   }
   // Else if the excution policy is parallel, we execute for_each on the CPU instead
-  else  if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
-                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+  else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
+                     __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
     std::__terminate_on_exception([&] {
       __par_backend::__parallel_for(
           __first, __last, [&__value](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
@@ -46,7 +46,7 @@ __pstl_fill(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last
                 __cpu_backend_tag{}, __brick_first, __brick_last, __value);
           });
     });
-  // Else we execute for_each in serial
+    // Else we execute for_each in serial
   } else {
     std::fill(__first, __last, __value);
   }
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
index 20486d83863f420..14de2af8e4a15c6 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
@@ -10,8 +10,8 @@
 #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FOR_EACH_H
 
 #include <__algorithm/for_each.h>
-#include <__algorithm/pstl_backends/gpu_backends/backend.h>
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
+#include <__algorithm/pstl_backends/gpu_backends/backend.h>
 #include <__config>
 #include <__iterator/concepts.h>
 #include <__type_traits/is_execution_policy.h>
@@ -29,16 +29,16 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 template <class _ExecutionPolicy, class _ForwardIterator, class _Functor>
 _LIBCPP_HIDE_FROM_ABI void
 __pstl_for_each(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Functor __func) {
-  // It is only safe to execute for_each on the GPU, it the execution policy is 
+  // It is only safe to execute for_each on the GPU, it the execution policy is
   // parallel unsequenced, as it is the only execution policy prohibiting throwing
   // exceptions and allowing SIMD instructions
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
-                       __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
     std::__par_backend::__parallel_for_simd_1(__first, __last - __first, __func);
   }
   // Else if the excution policy is parallel, we execute for_each on the CPU instead
-  else  if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
-                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+  else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
+                     __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
     std::__terminate_on_exception([&] {
       std::__par_backend::__parallel_for(
           __first, __last, [__func](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
@@ -46,7 +46,7 @@ __pstl_for_each(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __
                 __cpu_backend_tag{}, __brick_first, __brick_last, __func);
           });
     });
-  // Else we execute for_each in serial
+    // Else we execute for_each in serial
   } else {
     std::for_each(__first, __last, __func);
   }
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
index 840118dbec5057c..4baa4e7f65859d1 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
@@ -46,8 +46,9 @@ _LIBCPP_HIDE_FROM_ABI inline T __omp_extract_base_ptr(std::__wrap_iter<T> w) {
 // Applying function or lambda in a loop
 
 template <class _Iterator, class _DifferenceType, class _Function>
-_LIBCPP_HIDE_FROM_ABI _Iterator __omp_parallel_for_simd_1(_Iterator __first, _DifferenceType __n, _Function __f) noexcept {
-  #pragma omp target teams distribute parallel for simd map(tofrom:__first[0:__n])
+_LIBCPP_HIDE_FROM_ABI _Iterator
+__omp_parallel_for_simd_1(_Iterator __first, _DifferenceType __n, _Function __f) noexcept {
+#  pragma omp target teams distribute parallel for simd map(tofrom : __first[0 : __n])
   for (_DifferenceType __i = 0; __i < __n; ++__i)
     __f(__first[__i]);
 
@@ -65,8 +66,9 @@ _LIBCPP_HIDE_FROM_ABI _Iterator __parallel_for_simd_1(_Iterator __first, _Differ
 // Assigning a value in a loop
 
 template <class _Index, class _DifferenceType, class _Tp>
-_LIBCPP_HIDE_FROM_ABI _Index __omp_parallel_for_simd_val_1(_Index __first, _DifferenceType __n, const _Tp& __value) noexcept {
-  #pragma omp target teams distribute parallel for simd map(tofrom:__first[0:__n]) map(to:__value)
+_LIBCPP_HIDE_FROM_ABI _Index
+__omp_parallel_for_simd_val_1(_Index __first, _DifferenceType __n, const _Tp& __value) noexcept {
+#  pragma omp target teams distribute parallel for simd map(tofrom : __first[0 : __n]) map(to : __value)
   for (_DifferenceType __i = 0; __i < __n; ++__i)
     __first[__i] = __value;
 
@@ -74,7 +76,8 @@ _LIBCPP_HIDE_FROM_ABI _Index __omp_parallel_for_simd_val_1(_Index __first, _Diff
 }
 
 template <class _Index, class _DifferenceType, class _Tp>
-_LIBCPP_HIDE_FROM_ABI _Index __parallel_for_simd_val_1(_Index __first, _DifferenceType __n, const _Tp& __value) noexcept {
+_LIBCPP_HIDE_FROM_ABI _Index
+__parallel_for_simd_val_1(_Index __first, _DifferenceType __n, const _Tp& __value) noexcept {
   __omp_parallel_for_simd_val_1(__omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __value);
   return __first + __n;
 }

>From 57abf3062c4e559fddd6bf173d415212b9f92e43 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Thu, 21 Sep 2023 12:50:17 -0700
Subject: [PATCH 03/24] Making PSTL GPU backend depend on CMake options rather
 than command line options

---
 libcxx/CMakeLists.txt                                 | 11 +++++------
 libcxx/include/__algorithm/pstl_backend.h             |  2 +-
 .../include/__algorithm/pstl_backends/gpu_backend.h   |  2 +-
 .../__algorithm/pstl_backends/gpu_backends/backend.h  |  8 ++++++--
 libcxx/include/__config_site.in                       |  1 +
 5 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt
index 43d2a448de79584..7aa47caa1ca335a 100644
--- a/libcxx/CMakeLists.txt
+++ b/libcxx/CMakeLists.txt
@@ -299,12 +299,10 @@ else()
   set(LIBCXX_PSTL_CPU_BACKEND "serial" CACHE STRING "Which PSTL CPU backend to use")
 endif()
 
-if (NOT DEFINED LIBCXX_PSTL_GPU_BACKEND)
-  if (${LIBCXX_ENABLE_GPU_OFFLOAD})
-    set(LIBCXX_PSTL_GPU_BACKEND "omp_offload" CACHE STRING "Which PSTL GPU backend to use")
-  else()
-    set(LIBCXX_PSTL_GPU_BACKEND "none" CACHE STRING "Which PSTL GPU backend to use")
-  endif()
+if (${LIBCXX_ENABLE_GPU_OFFLOAD})
+  set(LIBCXX_PSTL_GPU_BACKEND "omp_offload" CACHE STRING "Which PSTL GPU backend to use")
+else()
+  set(LIBCXX_PSTL_GPU_BACKEND "none" CACHE STRING "Which PSTL GPU backend to use")
 endif()
 
 # Misc options ----------------------------------------------------------------
@@ -819,6 +817,7 @@ else()
                        Valid backends are: serial, std_thread and libdispatch")
 endif()
 
+config_define_if(LIBCXX_ENABLE_GPU_OFFLOAD _LIBCPP_PSTL_GPU_OFFLOAD)
 if (LIBCXX_PSTL_GPU_BACKEND STREQUAL "omp_offload")
   config_define(1 _LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
 endif()
diff --git a/libcxx/include/__algorithm/pstl_backend.h b/libcxx/include/__algorithm/pstl_backend.h
index 47f5191b48517ba..0df8847fd33589a 100644
--- a/libcxx/include/__algorithm/pstl_backend.h
+++ b/libcxx/include/__algorithm/pstl_backend.h
@@ -180,7 +180,7 @@ struct __select_backend<std::execution::parallel_policy> {
   using type = __cpu_backend_tag;
 };
 
-#    if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
+#    if defined(_LIBCPP_PSTL_GPU_OFFLOAD)
 template <>
 struct __select_backend<std::execution::parallel_unsequenced_policy> {
   using type = __gpu_backend_tag;
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
index 7237036156a1bf3..d2a814b441224a5 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
@@ -13,7 +13,7 @@
 
 #include <__algorithm/pstl_backends/gpu_backends/backend.h>
 
-#if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
+#if defined(_LIBCPP_PSTL_GPU_OFFLOAD)
 #  include <__algorithm/pstl_backends/gpu_backends/fill.h>
 #  include <__algorithm/pstl_backends/gpu_backends/for_each.h>
 #endif
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/backend.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/backend.h
index a8b400afbb94d9d..a03ad35d8d2ae3e 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/backend.h
@@ -12,8 +12,12 @@
 #include <__config>
 #include <cstddef>
 
-#if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
-#  include <__algorithm/pstl_backends/gpu_backends/omp_offload.h>
+#if defined(_LIBCPP_PSTL_GPU_OFFLOAD)
+#  if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
+#    include <__algorithm/pstl_backends/gpu_backends/omp_offload.h>
+#  else
+#    error Invalid PSTL GPU backend
+#  endif
 #endif
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
diff --git a/libcxx/include/__config_site.in b/libcxx/include/__config_site.in
index e0edddce3afc3ff..e7fb4f423079333 100644
--- a/libcxx/include/__config_site.in
+++ b/libcxx/include/__config_site.in
@@ -35,6 +35,7 @@
 #cmakedefine _LIBCPP_PSTL_CPU_BACKEND_THREAD
 #cmakedefine _LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH
 #cmakedefine _LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD
+#cmakedefine _LIBCPP_PSTL_GPU_OFFLOAD
 
 // Hardening.
 #cmakedefine01 _LIBCPP_ENABLE_HARDENED_MODE_DEFAULT

>From 51d9ed5702a46ac604bafbe2f707033639f86706 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Thu, 21 Sep 2023 17:07:58 -0700
Subject: [PATCH 04/24] Added OpenMP offloaded version of std::transform

---
 libcxx/include/CMakeLists.txt                 |   1 +
 .../__algorithm/pstl_backends/gpu_backend.h   |   1 +
 .../pstl_backends/gpu_backends/omp_offload.h  | 119 +++++++++++++++++-
 .../pstl_backends/gpu_backends/transform.h    | 117 +++++++++++++++++
 4 files changed, 233 insertions(+), 5 deletions(-)
 create mode 100644 libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h

diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index a3d72df61a86dde..66e54cfbf1780ee 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -90,6 +90,7 @@ set(files
   __algorithm/pstl_backends/gpu_backends/fill.h
   __algorithm/pstl_backends/gpu_backends/for_each.h
   __algorithm/pstl_backends/gpu_backends/omp_offload.h
+  __algorithm/pstl_backends/gpu_backends/transform.h
   __algorithm/pstl_copy.h
   __algorithm/pstl_count.h
   __algorithm/pstl_fill.h
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
index d2a814b441224a5..dac26592dac5c1f 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
@@ -16,6 +16,7 @@
 #if defined(_LIBCPP_PSTL_GPU_OFFLOAD)
 #  include <__algorithm/pstl_backends/gpu_backends/fill.h>
 #  include <__algorithm/pstl_backends/gpu_backends/for_each.h>
+#  include <__algorithm/pstl_backends/gpu_backends/transform.h>
 #endif
 
 #endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
index 4baa4e7f65859d1..69221cbb8519233 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
@@ -28,6 +28,17 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 namespace __par_backend {
 inline namespace __omp_gpu_backend {
 
+// Checking if a pointer is in a range
+template <typename T1, typename T2, typename T3>
+_LIBCPP_HIDE_FROM_ABI inline bool __omp_in_ptr_range(T1 a, T2 p, T3 b) {
+  return false;
+}
+
+template <typename T>
+_LIBCPP_HIDE_FROM_ABI inline bool __omp_in_ptr_range(T* a, T* p, T* b) {
+  return std::less_equal<T*>{}(a, p) && std::less<T*>{}(p, b);
+}
+
 // In OpenMP, we need to extract the pointer for the underlying data for data
 // structures like std::vector and std::array to be able to map the data to the
 // device.
@@ -43,12 +54,16 @@ _LIBCPP_HIDE_FROM_ABI inline T __omp_extract_base_ptr(std::__wrap_iter<T> w) {
   return PT.to_address(w);
 }
 
+//===----------------------------------------------------------------------===//
+// Templates for one iterator
+//===----------------------------------------------------------------------===//
+
 // Applying function or lambda in a loop
 
 template <class _Iterator, class _DifferenceType, class _Function>
 _LIBCPP_HIDE_FROM_ABI _Iterator
-__omp_parallel_for_simd_1(_Iterator __first, _DifferenceType __n, _Function __f) noexcept {
-#  pragma omp target teams distribute parallel for simd map(tofrom : __first[0 : __n])
+__omp_parallel_for_simd_1(_Iterator __first, _DifferenceType __n, _Function __f, const int __device = 0) noexcept {
+#  pragma omp target teams distribute parallel for simd map(tofrom : __first[0 : __n]) device(__device)
   for (_DifferenceType __i = 0; __i < __n; ++__i)
     __f(__first[__i]);
 
@@ -66,9 +81,10 @@ _LIBCPP_HIDE_FROM_ABI _Iterator __parallel_for_simd_1(_Iterator __first, _Differ
 // Assigning a value in a loop
 
 template <class _Index, class _DifferenceType, class _Tp>
-_LIBCPP_HIDE_FROM_ABI _Index
-__omp_parallel_for_simd_val_1(_Index __first, _DifferenceType __n, const _Tp& __value) noexcept {
-#  pragma omp target teams distribute parallel for simd map(tofrom : __first[0 : __n]) map(to : __value)
+_LIBCPP_HIDE_FROM_ABI _Index __omp_parallel_for_simd_val_1(
+    _Index __first, _DifferenceType __n, const _Tp& __value, const int __device = 0) noexcept {
+#  pragma omp target teams distribute parallel for simd map(from : __first[0 : __n]) map(always, to : __value)         \
+      device(__device)
   for (_DifferenceType __i = 0; __i < __n; ++__i)
     __first[__i] = __value;
 
@@ -82,6 +98,99 @@ __parallel_for_simd_val_1(_Index __first, _DifferenceType __n, const _Tp& __valu
   return __first + __n;
 }
 
+//===----------------------------------------------------------------------===//
+// Templates for two iterators
+//===----------------------------------------------------------------------===//
+
+template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Function>
+_LIBCPP_HIDE_FROM_ABI _Iterator1 __omp_parallel_for_simd_2(
+    _Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Function __f, const int __device = 0) noexcept {
+  if ((!std::is_same<_Iterator1, _Iterator2>::value) ||
+      (std::is_same<_Iterator1, _Iterator2>::value &&
+       !__omp_gpu_backend::__omp_in_ptr_range(__first1, __first2, __first1 + __n))) {
+#  pragma omp target teams distribute parallel for simd map(to : __first1[0 : __n]) map(from : __first2[0 : __n])      \
+      device(__device)
+    for (_DifferenceType __i = 0; __i < __n; ++__i)
+      __f(__first1[__i], __first2[__i]);
+    return __first1 + __n;
+  }
+#  pragma omp target teams distribute parallel for simd map(tofrom : __first1[0 : __n], __first2[0 : __n])             \
+      device(__device)
+  for (_DifferenceType __i = 0; __i < __n; ++__i)
+    __f(__first1[__i], __first2[__i]);
+
+  return __first1 + __n;
+}
+
+// Extracting the underlying pointer
+
+template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Function>
+_LIBCPP_HIDE_FROM_ABI _Iterator1
+__parallel_for_simd_2(_Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Function __f) noexcept {
+  __omp_parallel_for_simd_2(
+      __omp_gpu_backend::__omp_extract_base_ptr(__first1),
+      __n,
+      __omp_gpu_backend::__omp_extract_base_ptr(__first2),
+      __f);
+  return __first1 + __n;
+}
+
+//===----------------------------------------------------------------------===//
+// Templates for three iterator
+//===----------------------------------------------------------------------===//
+
+template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Iterator3, class _Function>
+_LIBCPP_HIDE_FROM_ABI _Iterator1 __omp_parallel_for_simd_3(
+    _Iterator1 __first1,
+    _DifferenceType __n,
+    _Iterator2 __first2,
+    _Iterator3 __first3,
+    _Function __f,
+    const int __device = 0) noexcept {
+  // It may be that __first3 is in the interval [__first1+__n) or [__firt2+__n)
+  // It is, however, undefined behavior to compare two pointers that do not
+  // point to the same object or are not the same type.
+  // If we can prove that __first3 is not in any of the ranges [__first1+__n)
+  // or [__firt2+__n), it is safe to reduce the amount of data copied to and
+  // from the device
+  constexpr bool are_not_same_type =
+      !std::is_same<_Iterator1, _Iterator2>::value && !std::is_same<_Iterator1, _Iterator3>::value;
+  const bool no_overlap_13 =
+      std::is_same<_Iterator1, _Iterator3>::value &&
+      !__omp_gpu_backend::__omp_in_ptr_range(__first1, __first3, __first1 + __n);
+  const bool no_overlap_23 =
+      std::is_same<_Iterator2, _Iterator3>::value &&
+      !__omp_gpu_backend::__omp_in_ptr_range(__first2, __first3, __first2 + __n);
+  if (are_not_same_type || (no_overlap_13 && no_overlap_23)) {
+#  pragma omp target teams distribute parallel for simd map(to : __first1[0 : __n], __first2[0 : __n])                 \
+      map(from : __first3[0 : __n]) device(__device)
+    for (_DifferenceType __i = 0; __i < __n; ++__i)
+      __f(__first1[__i], __first2[__i], __first3[__i]);
+    return __first1 + __n;
+  }
+  // In the general case, we have to map all data to and from the device
+#  pragma omp target teams distribute parallel for simd map(                                                           \
+          tofrom : __first1[0 : __n], __first2[0 : __n], __first3[0 : __n]) device(__device)
+  for (_DifferenceType __i = 0; __i < __n; ++__i)
+    __f(__first1[__i], __first2[__i], __first3[__i]);
+
+  return __first1 + __n;
+}
+
+// Extracting the underlying pointer
+
+template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Iterator3, class _Function>
+_LIBCPP_HIDE_FROM_ABI _Iterator1 __parallel_for_simd_3(
+    _Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Iterator3 __first3, _Function __f) noexcept {
+  __omp_parallel_for_simd_3(
+      __omp_gpu_backend::__omp_extract_base_ptr(__first1),
+      __n,
+      __omp_gpu_backend::__omp_extract_base_ptr(__first2),
+      __omp_gpu_backend::__omp_extract_base_ptr(__first3),
+      __f);
+  return __first1 + __n;
+}
+
 } // namespace __omp_gpu_backend
 } // namespace __par_backend
 
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
new file mode 100644
index 000000000000000..03eba11a3f5f52b
--- /dev/null
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
@@ -0,0 +1,117 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_TRANSFORM_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_TRANSFORM_H
+
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
+#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__algorithm/transform.h>
+#include <__config>
+#include <__iterator/concepts.h>
+#include <__iterator/iterator_traits.h>
+#include <__type_traits/enable_if.h>
+#include <__type_traits/is_execution_policy.h>
+#include <__type_traits/remove_cvref.h>
+#include <__utility/terminate_on_exception.h>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+template <class _ExecutionPolicy, class _ForwardIterator, class _ForwardOutIterator, class _UnaryOperation>
+_LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
+    __gpu_backend_tag,
+    _ForwardIterator __first,
+    _ForwardIterator __last,
+    _ForwardOutIterator __result,
+    _UnaryOperation __op) {
+  if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
+                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
+                __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) {
+    return std::__par_backend::__parallel_for_simd_2(
+        __first,
+        __last - __first,
+        __result,
+        [&](__iter_reference<_ForwardIterator> __in_value, __iter_reference<_ForwardOutIterator> __out_value) {
+          __out_value = __op(__in_value);
+        });
+  } else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
+                       __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
+                       __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) {
+    std::__terminate_on_exception([&] {
+      std::__par_backend::__parallel_for(
+          __first, __last, [__op, __first, __result](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
+            return std::__pstl_transform<__remove_parallel_policy_t<_ExecutionPolicy>>(
+                __cpu_backend_tag{}, __brick_first, __brick_last, __result + (__brick_first - __first), __op);
+          });
+    });
+    return __result + (__last - __first);
+  } else {
+    return std::transform(__first, __last, __result, __op);
+  }
+}
+
+template <class _ExecutionPolicy,
+          class _ForwardIterator1,
+          class _ForwardIterator2,
+          class _ForwardOutIterator,
+          class _BinaryOperation,
+          enable_if_t<is_execution_policy_v<__remove_cvref_t<_ExecutionPolicy>>, int> = 0>
+_LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
+    __gpu_backend_tag,
+    _ForwardIterator1 __first1,
+    _ForwardIterator1 __last1,
+    _ForwardIterator2 __first2,
+    _ForwardOutIterator __result,
+    _BinaryOperation __op) {
+  if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
+                __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
+                __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value &&
+                __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) {
+    return std::__par_backend::__parallel_for_simd_3(
+        __first1,
+        __last1 - __first1,
+        __first2,
+        __result,
+        [&](__iter_reference<_ForwardIterator1> __in1,
+            __iter_reference<_ForwardIterator2> __in2,
+            __iter_reference<_ForwardOutIterator> __out_value) { __out_value = __op(__in1, __in2); });
+  } else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
+                       __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
+                       __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value &&
+                       __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) {
+    std::__terminate_on_exception([&] {
+      std::__par_backend::__parallel_for(
+          __first1,
+          __last1,
+          [__op, __first1, __first2, __result](_ForwardIterator1 __brick_first, _ForwardIterator1 __brick_last) {
+            return std::__pstl_transform<__remove_parallel_policy_t<_ExecutionPolicy>>(
+                __cpu_backend_tag{},
+                __brick_first,
+                __brick_last,
+                __first2 + (__brick_first - __first1),
+                __result + (__brick_first - __first1),
+                __op);
+          });
+    });
+    return __result + (__last1 - __first1);
+  } else {
+    return std::transform(__first1, __last1, __first2, __result, __op);
+  }
+}
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_TRANSFORM_H

>From 33b61efe005cb12247cdf89ce7b7a4c5ca849f5b Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Fri, 22 Sep 2023 11:55:53 -0700
Subject: [PATCH 05/24] Changing lambdas to capture by value in std::transform
 for GPUs

---
 .../__algorithm/pstl_backends/gpu_backends/transform.h    | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
index 03eba11a3f5f52b..7fcfde44aaaa7a6 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
@@ -38,11 +38,13 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
                 __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) {
+    // While the CPU backend captures by reference, [&], that is not valid when
+    // offloading to the GPU. Therefore we must capture by value, [=].
     return std::__par_backend::__parallel_for_simd_2(
         __first,
         __last - __first,
         __result,
-        [&](__iter_reference<_ForwardIterator> __in_value, __iter_reference<_ForwardOutIterator> __out_value) {
+        [=](__iter_reference<_ForwardIterator> __in_value, __iter_reference<_ForwardOutIterator> __out_value) {
           __out_value = __op(__in_value);
         });
   } else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
@@ -78,12 +80,14 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
                 __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value &&
                 __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) {
+    // While the CPU backend captures by reference, [&], that is not valid when
+    // offloading to the GPU. Therefore we must capture by value, [=].
     return std::__par_backend::__parallel_for_simd_3(
         __first1,
         __last1 - __first1,
         __first2,
         __result,
-        [&](__iter_reference<_ForwardIterator1> __in1,
+        [=](__iter_reference<_ForwardIterator1> __in1,
             __iter_reference<_ForwardIterator2> __in2,
             __iter_reference<_ForwardOutIterator> __out_value) { __out_value = __op(__in1, __in2); });
   } else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&

>From 065f52d742a4e490b9b0441b9d997fe8e4268a8a Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Mon, 25 Sep 2023 13:13:39 -0700
Subject: [PATCH 06/24] GPU Offloading Implementation of std::transform_reduce

---
 libcxx/include/CMakeLists.txt                 |   1 +
 .../__algorithm/pstl_backends/gpu_backend.h   |   1 +
 .../pstl_backends/gpu_backends/omp_offload.h  | 113 ++++++++++++++
 .../gpu_backends/transform_reduce.h           | 147 ++++++++++++++++++
 4 files changed, 262 insertions(+)
 create mode 100644 libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h

diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index 66e54cfbf1780ee..ea00d3fdaea2924 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -91,6 +91,7 @@ set(files
   __algorithm/pstl_backends/gpu_backends/for_each.h
   __algorithm/pstl_backends/gpu_backends/omp_offload.h
   __algorithm/pstl_backends/gpu_backends/transform.h
+  __algorithm/pstl_backends/gpu_backends/transform_reduce.h
   __algorithm/pstl_copy.h
   __algorithm/pstl_count.h
   __algorithm/pstl_fill.h
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
index dac26592dac5c1f..ea7f39dea905474 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
@@ -17,6 +17,7 @@
 #  include <__algorithm/pstl_backends/gpu_backends/fill.h>
 #  include <__algorithm/pstl_backends/gpu_backends/for_each.h>
 #  include <__algorithm/pstl_backends/gpu_backends/transform.h>
+#  include <__algorithm/pstl_backends/gpu_backends/transform_reduce.h>
 #endif
 
 #endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
index 69221cbb8519233..d1cc6133f8e0876 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
@@ -191,6 +191,119 @@ _LIBCPP_HIDE_FROM_ABI _Iterator1 __parallel_for_simd_3(
   return __first1 + __n;
 }
 
+//===----------------------------------------------------------------------===//
+// Templates for reductions
+//===----------------------------------------------------------------------===//
+
+// General case
+
+#  define __PSTL_OMP_SIMD_1_REDUCTION(omp_op, std_op)                                                                            \
+    template <class _Iterator,                                                                                                   \
+              class _DifferenceType,                                                                                             \
+              typename _Tp,                                                                                                      \
+              typename _BinaryOperationType,                                                                                     \
+              typename _UnaryOperation,                                                                                          \
+              __enable_if_t<is_arithmetic_v<_Tp>, int> = 0 >                                                                     \
+    _LIBCPP_HIDE_FROM_ABI _Tp __omp_parallel_for_simd_reduction_1(                                                               \
+        _Iterator __first,                                                                                                       \
+        _DifferenceType __n,                                                                                                     \
+        _Tp __init,                                                                                                              \
+        std_op<_BinaryOperationType> __reduce,                                                                                   \
+        _UnaryOperation __transform,                                                                                             \
+        const int __device = 0) noexcept {                                                                                       \
+_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init) map(to : __first[0 : __n]) device(__device)) \
+      for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                            \
+        __init = __reduce(__init, __transform(__first[__i]));                                                                    \
+      return __init;                                                                                                             \
+    }
+
+#  define __PSTL_OMP_SIMD_2_REDUCTION(omp_op, std_op)                                                                                                \
+    template <class _Iterator1,                                                                                                                      \
+              class _Iterator2,                                                                                                                      \
+              class _DifferenceType,                                                                                                                 \
+              typename _Tp,                                                                                                                          \
+              typename _BinaryOperationType,                                                                                                         \
+              typename _UnaryOperation,                                                                                                              \
+              __enable_if_t<is_arithmetic_v<_Tp>, int> = 0 >                                                                                         \
+    _LIBCPP_HIDE_FROM_ABI _Tp __omp_parallel_for_simd_reduction_2(                                                                                   \
+        _Iterator1 __first1,                                                                                                                         \
+        _Iterator2 __first2,                                                                                                                         \
+        _DifferenceType __n,                                                                                                                         \
+        _Tp __init,                                                                                                                                  \
+        std_op<_BinaryOperationType> __reduce,                                                                                                       \
+        _UnaryOperation __transform,                                                                                                                 \
+        const int __device = 0) noexcept {                                                                                                           \
+_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init) map(to : __first1[0 : __n], __first2[0 : __n]) device(__device)) \
+      for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                                                \
+        __init = __reduce(__init, __transform(__first1[__i], __first2[__i]));                                                                        \
+      return __init;                                                                                                                                 \
+    }
+
+#  define __PSTL_OMP_SIMD_REDUCTION(omp_op, std_op)                                                                    \
+    __PSTL_OMP_SIMD_1_REDUCTION(omp_op, std_op)                                                                        \
+    __PSTL_OMP_SIMD_2_REDUCTION(omp_op, std_op)
+
+// Addition
+__PSTL_OMP_SIMD_REDUCTION(+, std::plus)
+
+// Subtraction
+__PSTL_OMP_SIMD_REDUCTION(-, std::minus)
+
+// Multiplication
+__PSTL_OMP_SIMD_REDUCTION(*, std::multiplies)
+
+// Logical and
+__PSTL_OMP_SIMD_REDUCTION(&&, std::logical_and)
+
+// Logical or
+__PSTL_OMP_SIMD_REDUCTION(||, std::logical_or)
+
+// Bitwise and
+__PSTL_OMP_SIMD_REDUCTION(&, std::bit_and)
+
+// Bitwise or
+__PSTL_OMP_SIMD_REDUCTION(|, std::bit_or)
+
+// Bitwise xor
+__PSTL_OMP_SIMD_REDUCTION(^, std::bit_xor)
+
+// Extracting the underlying pointers
+
+template <class _Iterator, class _DifferenceType, typename _Tp, typename _BinaryOperation, typename _UnaryOperation >
+_LIBCPP_HIDE_FROM_ABI _Tp __parallel_for_simd_reduction_1(
+    _Iterator __first,
+    _DifferenceType __n,
+    _Tp __init,
+    _BinaryOperation __reduce,
+    _UnaryOperation __transform,
+    const int __device = 0) noexcept {
+  return __omp_parallel_for_simd_reduction_1(
+      __omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __init, __reduce, __transform);
+}
+
+template <class _Iterator1,
+          class _Iterator2,
+          class _DifferenceType,
+          typename _Tp,
+          typename _BinaryOperation,
+          typename _UnaryOperation >
+_LIBCPP_HIDE_FROM_ABI _Tp __parallel_for_simd_reduction_2(
+    _Iterator1 __first1,
+    _Iterator2 __first2,
+    _DifferenceType __n,
+    _Tp __init,
+    _BinaryOperation __reduce,
+    _UnaryOperation __transform,
+    const int __device = 0) noexcept {
+  return __omp_parallel_for_simd_reduction_2(
+      __omp_gpu_backend::__omp_extract_base_ptr(__first1),
+      __omp_gpu_backend::__omp_extract_base_ptr(__first2),
+      __n,
+      __init,
+      __reduce,
+      __transform);
+}
+
 } // namespace __omp_gpu_backend
 } // namespace __par_backend
 
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h
new file mode 100644
index 000000000000000..43e5631aef04afb
--- /dev/null
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h
@@ -0,0 +1,147 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_TRANSFORM_REDUCE_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_TRANSFORM_REDUCE_H
+
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
+#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__config>
+#include <__iterator/concepts.h>
+#include <__iterator/iterator_traits.h>
+#include <__numeric/transform_reduce.h>
+#include <__type_traits/is_arithmetic.h>
+#include <__type_traits/is_execution_policy.h>
+#include <__type_traits/operation_traits.h>
+#include <__utility/move.h>
+#include <__utility/terminate_on_exception.h>
+#include <new>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+//===----------------------------------------------------------------------===//
+// Two input iterators
+//===----------------------------------------------------------------------===//
+
+template <class _ExecutionPolicy,
+          class _ForwardIterator1,
+          class _ForwardIterator2,
+          class _Tp,
+          class _BinaryOperation1,
+          class _BinaryOperation2>
+_LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
+    __gpu_backend_tag,
+    _ForwardIterator1 __first1,
+    _ForwardIterator1 __last1,
+    _ForwardIterator2 __first2,
+    _Tp __init,
+    _BinaryOperation1 __reduce,
+    _BinaryOperation2 __transform) {
+  if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
+                __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
+                __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value) {
+    return std::__par_backend::__parallel_for_simd_reduction_2(
+        std::move(__first1),
+        std::move(__first2),
+        __last1 - __first1,
+        std::move(__init),
+        std::move(__reduce),
+        [=](__iter_reference<_ForwardIterator1> __in_value_1, __iter_reference<_ForwardIterator1> __in_value_2) {
+          return __transform(__in_value_1, __in_value_2);
+        });
+  } else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
+                       __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
+                       __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value) {
+    return std::__terminate_on_exception([&] {
+      return __par_backend::__parallel_transform_reduce(
+          __first1,
+          std::move(__last1),
+          [__first1, __first2, __transform](_ForwardIterator1 __iter) {
+            return __transform(*__iter, *(__first2 + (__iter - __first1)));
+          },
+          std::move(__init),
+          std::move(__reduce),
+          [__first1, __first2, __reduce, __transform](
+              _ForwardIterator1 __brick_first, _ForwardIterator1 __brick_last, _Tp __brick_init) {
+            return std::__pstl_transform_reduce<__remove_parallel_policy_t<_ExecutionPolicy>>(
+                __cpu_backend_tag{},
+                __brick_first,
+                std::move(__brick_last),
+                __first2 + (__brick_first - __first1),
+                std::move(__brick_init),
+                std::move(__reduce),
+                std::move(__transform));
+          });
+    });
+  } else {
+    return std::transform_reduce(
+        std::move(__first1),
+        std::move(__last1),
+        std::move(__first2),
+        std::move(__init),
+        std::move(__reduce),
+        std::move(__transform));
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// One input iterator
+//===----------------------------------------------------------------------===//
+
+template <class _ExecutionPolicy, class _ForwardIterator, class _Tp, class _BinaryOperation, class _UnaryOperation>
+_LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
+    __gpu_backend_tag,
+    _ForwardIterator __first,
+    _ForwardIterator __last,
+    _Tp __init,
+    _BinaryOperation __reduce,
+    _UnaryOperation __transform) {
+  if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
+                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+    return std::__par_backend::__parallel_for_simd_reduction_1(
+        std::move(__first),
+        __last - __first,
+        std::move(__init),
+        std::move(__reduce),
+        [=](__iter_reference<_ForwardIterator> __in_value) { return __transform(__in_value); });
+  } else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
+                       __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+    return std::__terminate_on_exception([&] {
+      return __par_backend::__parallel_transform_reduce(
+          std::move(__first),
+          std::move(__last),
+          [__transform](_ForwardIterator __iter) { return __transform(*__iter); },
+          std::move(__init),
+          __reduce,
+          [__transform, __reduce](auto __brick_first, auto __brick_last, _Tp __brick_init) {
+            return std::__pstl_transform_reduce<__remove_parallel_policy_t<_ExecutionPolicy>>(
+                __cpu_backend_tag{},
+                std::move(__brick_first),
+                std::move(__brick_last),
+                std::move(__brick_init),
+                std::move(__reduce),
+                std::move(__transform));
+          });
+    });
+  } else {
+    return std::transform_reduce(
+        std::move(__first), std::move(__last), std::move(__init), std::move(__reduce), std::move(__transform));
+  }
+}
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_TRANSFORM_REDUCE_H

>From 67ecdee8fc6fcf94873486858445cf1a7b37c7dc Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Tue, 26 Sep 2023 19:55:12 -0700
Subject: [PATCH 07/24] Fixed almost all test cases that failed during ninja
 check-cxx

---
 libcxx/include/CMakeLists.txt                 |   4 +
 .../__algorithm/pstl_backends/gpu_backend.h   |   4 +
 .../pstl_backends/gpu_backends/any_of.h       |  41 +++++++
 .../pstl_backends/gpu_backends/fill.h         |  20 +---
 .../pstl_backends/gpu_backends/find_if.h      |  44 ++++++++
 .../pstl_backends/gpu_backends/for_each.h     |  17 +--
 .../pstl_backends/gpu_backends/merge.h        |  51 +++++++++
 .../pstl_backends/gpu_backends/omp_offload.h  |  94 +++++++++-------
 .../pstl_backends/gpu_backends/stable_sort.h  |  38 +++++++
 .../pstl_backends/gpu_backends/transform.h    |  61 ++--------
 .../gpu_backends/transform_reduce.h           | 105 ++++++------------
 11 files changed, 292 insertions(+), 187 deletions(-)
 create mode 100644 libcxx/include/__algorithm/pstl_backends/gpu_backends/any_of.h
 create mode 100644 libcxx/include/__algorithm/pstl_backends/gpu_backends/find_if.h
 create mode 100644 libcxx/include/__algorithm/pstl_backends/gpu_backends/merge.h
 create mode 100644 libcxx/include/__algorithm/pstl_backends/gpu_backends/stable_sort.h

diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index ea00d3fdaea2924..8dfd4e2f26ecf8f 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -86,10 +86,14 @@ set(files
   __algorithm/pstl_backends/cpu_backends/transform.h
   __algorithm/pstl_backends/cpu_backends/transform_reduce.h
   __algorithm/pstl_backends/gpu_backend.h
+  __algorithm/pstl_backends/gpu_backends/any_of.h
   __algorithm/pstl_backends/gpu_backends/backend.h
   __algorithm/pstl_backends/gpu_backends/fill.h
+  __algorithm/pstl_backends/gpu_backends/find_if.h
   __algorithm/pstl_backends/gpu_backends/for_each.h
+  __algorithm/pstl_backends/gpu_backends/merge.h
   __algorithm/pstl_backends/gpu_backends/omp_offload.h
+  __algorithm/pstl_backends/gpu_backends/stable_sort.h
   __algorithm/pstl_backends/gpu_backends/transform.h
   __algorithm/pstl_backends/gpu_backends/transform_reduce.h
   __algorithm/pstl_copy.h
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
index ea7f39dea905474..f41332fbf9f6d42 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
@@ -14,8 +14,12 @@
 #include <__algorithm/pstl_backends/gpu_backends/backend.h>
 
 #if defined(_LIBCPP_PSTL_GPU_OFFLOAD)
+#  include <__algorithm/pstl_backends/gpu_backends/any_of.h>
 #  include <__algorithm/pstl_backends/gpu_backends/fill.h>
+#  include <__algorithm/pstl_backends/gpu_backends/find_if.h>
 #  include <__algorithm/pstl_backends/gpu_backends/for_each.h>
+#  include <__algorithm/pstl_backends/gpu_backends/merge.h>
+#  include <__algorithm/pstl_backends/gpu_backends/stable_sort.h>
 #  include <__algorithm/pstl_backends/gpu_backends/transform.h>
 #  include <__algorithm/pstl_backends/gpu_backends/transform_reduce.h>
 #endif
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/any_of.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/any_of.h
new file mode 100644
index 000000000000000..8d911de55dcd685
--- /dev/null
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/any_of.h
@@ -0,0 +1,41 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_ANY_OF_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_ANY_OF_H
+
+#include <__algorithm/any_of.h>
+#include <__algorithm/find_if.h>
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
+#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__atomic/atomic.h>
+#include <__atomic/memory_order.h>
+#include <__config>
+#include <__functional/operations.h>
+#include <__iterator/concepts.h>
+#include <__type_traits/is_execution_policy.h>
+#include <__utility/pair.h>
+#include <__utility/terminate_on_exception.h>
+#include <cstdint>
+
+#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+template <class _ExecutionPolicy, class _ForwardIterator, class _Predicate>
+_LIBCPP_HIDE_FROM_ABI bool
+__pstl_any_of(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) {
+  // TODO: Implement GPU backend
+  return std::__pstl_any_of<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __pred);
+}
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_ANY_OF_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
index 32926da87e2a083..8dc6bc6a6179c0e 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
@@ -14,6 +14,7 @@
 #include <__algorithm/pstl_backends/gpu_backends/backend.h>
 #include <__config>
 #include <__iterator/concepts.h>
+#include <__iterator/iterator_traits.h>
 #include <__type_traits/is_execution_policy.h>
 #include <__utility/terminate_on_exception.h>
 #include <stdio.h>
@@ -33,23 +34,12 @@ __pstl_fill(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last
   // parallel unsequenced, as it is the only execution policy prohibiting throwing
   // exceptions and allowing SIMD instructions
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
-                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
+                __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
     std::__par_backend::__parallel_for_simd_val_1(__first, __last - __first, __value);
   }
-  // Else if the excution policy is parallel, we execute for_each on the CPU instead
-  else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
-                     __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
-    std::__terminate_on_exception([&] {
-      __par_backend::__parallel_for(
-          __first, __last, [&__value](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
-            std::__pstl_fill<__remove_parallel_policy_t<_ExecutionPolicy>>(
-                __cpu_backend_tag{}, __brick_first, __brick_last, __value);
-          });
-    });
-    // Else we execute for_each in serial
-  } else {
-    std::fill(__first, __last, __value);
-  }
+  // Otherwise, we execute for_each on the CPU instead
+  return std::__pstl_fill<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __value);
 }
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/find_if.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/find_if.h
new file mode 100644
index 000000000000000..2d34938f92dff38
--- /dev/null
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/find_if.h
@@ -0,0 +1,44 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_FIND_IF_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_FIND_IF_H
+
+#include <__algorithm/find_if.h>
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
+#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__atomic/atomic.h>
+#include <__config>
+#include <__functional/operations.h>
+#include <__iterator/concepts.h>
+#include <__iterator/iterator_traits.h>
+#include <__type_traits/is_execution_policy.h>
+#include <__utility/pair.h>
+#include <__utility/terminate_on_exception.h>
+#include <cstddef>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+template <class _ExecutionPolicy, class _ForwardIterator, class _Predicate>
+_LIBCPP_HIDE_FROM_ABI _ForwardIterator
+__pstl_find_if(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) {
+  // TODO: Implement the GPU backend
+  return std::__pstl_find_if<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __pred);
+}
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_FIND_IF_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
index 14de2af8e4a15c6..23c8da27e64ae3b 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
@@ -33,23 +33,12 @@ __pstl_for_each(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __
   // parallel unsequenced, as it is the only execution policy prohibiting throwing
   // exceptions and allowing SIMD instructions
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
-                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
+                __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
     std::__par_backend::__parallel_for_simd_1(__first, __last - __first, __func);
   }
   // Else if the excution policy is parallel, we execute for_each on the CPU instead
-  else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
-                     __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
-    std::__terminate_on_exception([&] {
-      std::__par_backend::__parallel_for(
-          __first, __last, [__func](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
-            std::__pstl_for_each<__remove_parallel_policy_t<_ExecutionPolicy>>(
-                __cpu_backend_tag{}, __brick_first, __brick_last, __func);
-          });
-    });
-    // Else we execute for_each in serial
-  } else {
-    std::for_each(__first, __last, __func);
-  }
+  return std::__pstl_for_each<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __func);
 }
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/merge.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/merge.h
new file mode 100644
index 000000000000000..bc947ebb27ac7f0
--- /dev/null
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/merge.h
@@ -0,0 +1,51 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_MERGE_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_MERGE_H
+
+#include <__algorithm/merge.h>
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
+#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__config>
+#include <__iterator/concepts.h>
+#include <__type_traits/is_execution_policy.h>
+#include <__utility/move.h>
+#include <__utility/terminate_on_exception.h>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+template <class _ExecutionPolicy,
+          class _ForwardIterator1,
+          class _ForwardIterator2,
+          class _ForwardOutIterator,
+          class _Comp>
+_LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_merge(
+    __gpu_backend_tag,
+    _ForwardIterator1 __first1,
+    _ForwardIterator1 __last1,
+    _ForwardIterator2 __first2,
+    _ForwardIterator2 __last2,
+    _ForwardOutIterator __result,
+    _Comp __comp) {
+  // TODO: Implement GPU backend
+  return std::__pstl_merge<_ExecutionPolicy>(
+      __cpu_backend_tag{}, __first1, __last1, __first2, __last2, __result, __comp);
+}
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_MERGE_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
index d1cc6133f8e0876..36acafd448ec003 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
@@ -11,9 +11,19 @@
 
 #include <__assert>
 #include <__config>
+#include <__functional/operations.h>
+#include <__iterator/wrap_iter.h>
+#include <__memory/addressof.h>
+#include <__memory/pointer_traits.h>
+#include <__type_traits/is_pointer.h>
+#include <__type_traits/is_same.h>
 #include <__utility/move.h>
 #include <cstddef>
 
+// is_same
+
+// __libcpp_is_contiguous_iterator
+
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
 #endif
@@ -30,27 +40,33 @@ inline namespace __omp_gpu_backend {
 
 // Checking if a pointer is in a range
 template <typename T1, typename T2, typename T3>
-_LIBCPP_HIDE_FROM_ABI inline bool __omp_in_ptr_range(T1 a, T2 p, T3 b) {
+_LIBCPP_HIDE_FROM_ABI inline bool __omp_in_ptr_range(T1, T2, T3) {
   return false;
 }
 
-template <typename T>
-_LIBCPP_HIDE_FROM_ABI inline bool __omp_in_ptr_range(T* a, T* p, T* b) {
-  return std::less_equal<T*>{}(a, p) && std::less<T*>{}(p, b);
+template <typename _Tp>
+_LIBCPP_HIDE_FROM_ABI inline bool __omp_in_ptr_range(_Tp* a, _Tp* p, _Tp* b) {
+  return std::less_equal<_Tp*>{}(a, p) && std::less<_Tp*>{}(p, b);
 }
 
 // In OpenMP, we need to extract the pointer for the underlying data for data
 // structures like std::vector and std::array to be able to map the data to the
 // device.
 
-template <typename T>
-_LIBCPP_HIDE_FROM_ABI inline T __omp_extract_base_ptr(T p) {
+template <typename _Tp, std::enable_if<std::is_pointer<_Tp>::value >::type* = 0>
+_LIBCPP_HIDE_FROM_ABI inline _Tp __omp_extract_base_ptr(_Tp p) {
   return p;
 }
 
-template <typename T>
-_LIBCPP_HIDE_FROM_ABI inline T __omp_extract_base_ptr(std::__wrap_iter<T> w) {
-  std::pointer_traits<std::__wrap_iter<T>> PT;
+template <typename _Tp>
+_LIBCPP_HIDE_FROM_ABI inline auto __omp_extract_base_ptr(_Tp p) {
+  return std::addressof(*p);
+  ;
+}
+
+template <typename _Tp>
+_LIBCPP_HIDE_FROM_ABI inline _Tp __omp_extract_base_ptr(std::__wrap_iter<_Tp> w) {
+  std::pointer_traits<std::__wrap_iter<_Tp>> PT;
   return PT.to_address(w);
 }
 
@@ -61,8 +77,8 @@ _LIBCPP_HIDE_FROM_ABI inline T __omp_extract_base_ptr(std::__wrap_iter<T> w) {
 // Applying function or lambda in a loop
 
 template <class _Iterator, class _DifferenceType, class _Function>
-_LIBCPP_HIDE_FROM_ABI _Iterator
-__omp_parallel_for_simd_1(_Iterator __first, _DifferenceType __n, _Function __f, const int __device = 0) noexcept {
+_LIBCPP_HIDE_FROM_ABI _Iterator __omp_parallel_for_simd_1(
+    _Iterator __first, _DifferenceType __n, _Function __f, [[maybe_unused]] const int __device = 0) noexcept {
 #  pragma omp target teams distribute parallel for simd map(tofrom : __first[0 : __n]) device(__device)
   for (_DifferenceType __i = 0; __i < __n; ++__i)
     __f(__first[__i]);
@@ -82,7 +98,7 @@ _LIBCPP_HIDE_FROM_ABI _Iterator __parallel_for_simd_1(_Iterator __first, _Differ
 
 template <class _Index, class _DifferenceType, class _Tp>
 _LIBCPP_HIDE_FROM_ABI _Index __omp_parallel_for_simd_val_1(
-    _Index __first, _DifferenceType __n, const _Tp& __value, const int __device = 0) noexcept {
+    _Index __first, _DifferenceType __n, const _Tp& __value, [[maybe_unused]] const int __device = 0) noexcept {
 #  pragma omp target teams distribute parallel for simd map(from : __first[0 : __n]) map(always, to : __value)         \
       device(__device)
   for (_DifferenceType __i = 0; __i < __n; ++__i)
@@ -104,20 +120,24 @@ __parallel_for_simd_val_1(_Index __first, _DifferenceType __n, const _Tp& __valu
 
 template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Function>
 _LIBCPP_HIDE_FROM_ABI _Iterator1 __omp_parallel_for_simd_2(
-    _Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Function __f, const int __device = 0) noexcept {
+    _Iterator1 __first1,
+    _DifferenceType __n,
+    _Iterator2 __first2,
+    _Function __f,
+    [[maybe_unused]] const int __device = 0) noexcept {
   if ((!std::is_same<_Iterator1, _Iterator2>::value) ||
       (std::is_same<_Iterator1, _Iterator2>::value &&
        !__omp_gpu_backend::__omp_in_ptr_range(__first1, __first2, __first1 + __n))) {
 #  pragma omp target teams distribute parallel for simd map(to : __first1[0 : __n]) map(from : __first2[0 : __n])      \
       device(__device)
     for (_DifferenceType __i = 0; __i < __n; ++__i)
-      __f(__first1[__i], __first2[__i]);
+      __first2[__i] = __f(__first1[__i]);
     return __first1 + __n;
   }
 #  pragma omp target teams distribute parallel for simd map(tofrom : __first1[0 : __n], __first2[0 : __n])             \
       device(__device)
   for (_DifferenceType __i = 0; __i < __n; ++__i)
-    __f(__first1[__i], __first2[__i]);
+    __first2[__i] = __f(__first1[__i]);
 
   return __first1 + __n;
 }
@@ -146,7 +166,7 @@ _LIBCPP_HIDE_FROM_ABI _Iterator1 __omp_parallel_for_simd_3(
     _Iterator2 __first2,
     _Iterator3 __first3,
     _Function __f,
-    const int __device = 0) noexcept {
+    [[maybe_unused]] const int __device = 0) noexcept {
   // It may be that __first3 is in the interval [__first1+__n) or [__firt2+__n)
   // It is, however, undefined behavior to compare two pointers that do not
   // point to the same object or are not the same type.
@@ -165,14 +185,14 @@ _LIBCPP_HIDE_FROM_ABI _Iterator1 __omp_parallel_for_simd_3(
 #  pragma omp target teams distribute parallel for simd map(to : __first1[0 : __n], __first2[0 : __n])                 \
       map(from : __first3[0 : __n]) device(__device)
     for (_DifferenceType __i = 0; __i < __n; ++__i)
-      __f(__first1[__i], __first2[__i], __first3[__i]);
+      __first3[__i] = __f(__first1[__i], __first2[__i]);
     return __first1 + __n;
   }
   // In the general case, we have to map all data to and from the device
 #  pragma omp target teams distribute parallel for simd map(                                                           \
           tofrom : __first1[0 : __n], __first2[0 : __n], __first3[0 : __n]) device(__device)
   for (_DifferenceType __i = 0; __i < __n; ++__i)
-    __f(__first1[__i], __first2[__i], __first3[__i]);
+    __first3[__i] = __f(__first1[__i], __first2[__i]);
 
   return __first1 + __n;
 }
@@ -197,46 +217,44 @@ _LIBCPP_HIDE_FROM_ABI _Iterator1 __parallel_for_simd_3(
 
 // General case
 
-#  define __PSTL_OMP_SIMD_1_REDUCTION(omp_op, std_op)                                                                            \
+#  define __PSTL_OMP_SIMD_1_REDUCTION(omp_op, std_op)                                                                                 \
     template <class _Iterator,                                                                                                   \
               class _DifferenceType,                                                                                             \
               typename _Tp,                                                                                                      \
               typename _BinaryOperationType,                                                                                     \
-              typename _UnaryOperation,                                                                                          \
-              __enable_if_t<is_arithmetic_v<_Tp>, int> = 0 >                                                                     \
+              typename _UnaryOperation>                                                                     \
     _LIBCPP_HIDE_FROM_ABI _Tp __omp_parallel_for_simd_reduction_1(                                                               \
         _Iterator __first,                                                                                                       \
         _DifferenceType __n,                                                                                                     \
         _Tp __init,                                                                                                              \
         std_op<_BinaryOperationType> __reduce,                                                                                   \
-        _UnaryOperation __transform,                                                                                             \
-        const int __device = 0) noexcept {                                                                                       \
-_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init) map(to : __first[0 : __n]) device(__device)) \
-      for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                            \
-        __init = __reduce(__init, __transform(__first[__i]));                                                                    \
-      return __init;                                                                                                             \
+        _UnaryOperation __transform/*,                                                                                             \
+        [[maybe_unused]] const int __device = 0*/) noexcept {    \
+_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init) map(to : __first[0 : __n])) /*device(__device))*/ \
+      for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                                 \
+        __init = __reduce(__init, __transform(__first[__i]));                                                                         \
+      return __init;                                                                                                                  \
     }
 
-#  define __PSTL_OMP_SIMD_2_REDUCTION(omp_op, std_op)                                                                                                \
+#  define __PSTL_OMP_SIMD_2_REDUCTION(omp_op, std_op)                                                                                                     \
     template <class _Iterator1,                                                                                                                      \
               class _Iterator2,                                                                                                                      \
               class _DifferenceType,                                                                                                                 \
               typename _Tp,                                                                                                                          \
               typename _BinaryOperationType,                                                                                                         \
-              typename _UnaryOperation,                                                                                                              \
-              __enable_if_t<is_arithmetic_v<_Tp>, int> = 0 >                                                                                         \
+              typename _UnaryOperation >                                                                                         \
     _LIBCPP_HIDE_FROM_ABI _Tp __omp_parallel_for_simd_reduction_2(                                                                                   \
         _Iterator1 __first1,                                                                                                                         \
         _Iterator2 __first2,                                                                                                                         \
         _DifferenceType __n,                                                                                                                         \
         _Tp __init,                                                                                                                                  \
         std_op<_BinaryOperationType> __reduce,                                                                                                       \
-        _UnaryOperation __transform,                                                                                                                 \
-        const int __device = 0) noexcept {                                                                                                           \
-_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init) map(to : __first1[0 : __n], __first2[0 : __n]) device(__device)) \
-      for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                                                \
-        __init = __reduce(__init, __transform(__first1[__i], __first2[__i]));                                                                        \
-      return __init;                                                                                                                                 \
+        _UnaryOperation __transform/*,                                                                                                                 \
+        [[maybe_unused]] const int __device = 0*/) noexcept {    \
+_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init) map(to : __first1[0 : __n], __first2[0 : __n]))/* device(__device))*/ \
+      for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                                                     \
+        __init = __reduce(__init, __transform(__first1[__i], __first2[__i]));                                                                             \
+      return __init;                                                                                                                                      \
     }
 
 #  define __PSTL_OMP_SIMD_REDUCTION(omp_op, std_op)                                                                    \
@@ -276,7 +294,7 @@ _LIBCPP_HIDE_FROM_ABI _Tp __parallel_for_simd_reduction_1(
     _Tp __init,
     _BinaryOperation __reduce,
     _UnaryOperation __transform,
-    const int __device = 0) noexcept {
+    [[maybe_unused]] const int __device = 0) noexcept {
   return __omp_parallel_for_simd_reduction_1(
       __omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __init, __reduce, __transform);
 }
@@ -294,7 +312,7 @@ _LIBCPP_HIDE_FROM_ABI _Tp __parallel_for_simd_reduction_2(
     _Tp __init,
     _BinaryOperation __reduce,
     _UnaryOperation __transform,
-    const int __device = 0) noexcept {
+    [[maybe_unused]] const int __device = 0) noexcept {
   return __omp_parallel_for_simd_reduction_2(
       __omp_gpu_backend::__omp_extract_base_ptr(__first1),
       __omp_gpu_backend::__omp_extract_base_ptr(__first2),
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/stable_sort.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/stable_sort.h
new file mode 100644
index 000000000000000..1760a9fd9fc9d32
--- /dev/null
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/stable_sort.h
@@ -0,0 +1,38 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_STABLE_SORT_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_STABLE_SORT_H
+
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
+#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__algorithm/stable_sort.h>
+#include <__config>
+#include <__type_traits/is_execution_policy.h>
+#include <__utility/terminate_on_exception.h>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+template <class _ExecutionPolicy, class _RandomAccessIterator, class _Comp>
+_LIBCPP_HIDE_FROM_ABI void
+__pstl_stable_sort(__gpu_backend_tag, _RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp) {
+  // TODO: Implement GPU backend.
+  return __pstl_stable_sort<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __comp);
+}
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_STABLE_SORT_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
index 7fcfde44aaaa7a6..10f6e5ff174d675 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
@@ -37,30 +37,13 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
     _UnaryOperation __op) {
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
-                __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) {
+                __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value &&
+                __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
     // While the CPU backend captures by reference, [&], that is not valid when
     // offloading to the GPU. Therefore we must capture by value, [=].
-    return std::__par_backend::__parallel_for_simd_2(
-        __first,
-        __last - __first,
-        __result,
-        [=](__iter_reference<_ForwardIterator> __in_value, __iter_reference<_ForwardOutIterator> __out_value) {
-          __out_value = __op(__in_value);
-        });
-  } else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
-                       __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
-                       __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) {
-    std::__terminate_on_exception([&] {
-      std::__par_backend::__parallel_for(
-          __first, __last, [__op, __first, __result](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
-            return std::__pstl_transform<__remove_parallel_policy_t<_ExecutionPolicy>>(
-                __cpu_backend_tag{}, __brick_first, __brick_last, __result + (__brick_first - __first), __op);
-          });
-    });
-    return __result + (__last - __first);
-  } else {
-    return std::transform(__first, __last, __result, __op);
+    return std::__par_backend::__parallel_for_simd_2(__first, __last - __first, __result, __op);
   }
+  return std::__pstl_transform<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __result, __op);
 }
 
 template <class _ExecutionPolicy,
@@ -79,39 +62,15 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value &&
-                __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) {
+                __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value &&
+                __libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
+                __libcpp_is_contiguous_iterator<_ForwardIterator2>::value &&
+                __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
     // While the CPU backend captures by reference, [&], that is not valid when
     // offloading to the GPU. Therefore we must capture by value, [=].
-    return std::__par_backend::__parallel_for_simd_3(
-        __first1,
-        __last1 - __first1,
-        __first2,
-        __result,
-        [=](__iter_reference<_ForwardIterator1> __in1,
-            __iter_reference<_ForwardIterator2> __in2,
-            __iter_reference<_ForwardOutIterator> __out_value) { __out_value = __op(__in1, __in2); });
-  } else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
-                       __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
-                       __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value &&
-                       __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) {
-    std::__terminate_on_exception([&] {
-      std::__par_backend::__parallel_for(
-          __first1,
-          __last1,
-          [__op, __first1, __first2, __result](_ForwardIterator1 __brick_first, _ForwardIterator1 __brick_last) {
-            return std::__pstl_transform<__remove_parallel_policy_t<_ExecutionPolicy>>(
-                __cpu_backend_tag{},
-                __brick_first,
-                __brick_last,
-                __first2 + (__brick_first - __first1),
-                __result + (__brick_first - __first1),
-                __op);
-          });
-    });
-    return __result + (__last1 - __first1);
-  } else {
-    return std::transform(__first1, __last1, __first2, __result, __op);
+    return std::__par_backend::__parallel_for_simd_3(__first1, __last1 - __first1, __first2, __result, __op);
   }
+  return std::__pstl_transform<_ExecutionPolicy>(__cpu_backend_tag{}, __first1, __last1, __first2, __result, __op);
 }
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h
index 43e5631aef04afb..8590dd3d024ea69 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h
@@ -12,9 +12,11 @@
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/gpu_backends/backend.h>
 #include <__config>
+#include <__functional/operations.h>
 #include <__iterator/concepts.h>
 #include <__iterator/iterator_traits.h>
 #include <__numeric/transform_reduce.h>
+#include <__type_traits/integral_constant.h>
 #include <__type_traits/is_arithmetic.h>
 #include <__type_traits/is_execution_policy.h>
 #include <__type_traits/operation_traits.h>
@@ -28,6 +30,25 @@
 
 #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
 
+template <class _T1, class _T2, class _T3>
+struct __is_supported_reduction : std::false_type {};
+
+#  define __PSTL_IS_SUPPORTED_REDUCTION(funname)                                                                       \
+    template <class _Tp>                                                                                               \
+    struct __is_supported_reduction<std::funname<_Tp>, _Tp, _Tp> : std::true_type {};                                  \
+    template <class _Tp, class _Up>                                                                                    \
+    struct __is_supported_reduction<std::funname<>, _Tp, _Up> : std::true_type {};
+
+// __is_trivial_plus_operation already exists
+__PSTL_IS_SUPPORTED_REDUCTION(plus)
+__PSTL_IS_SUPPORTED_REDUCTION(minus)
+__PSTL_IS_SUPPORTED_REDUCTION(multiplies)
+__PSTL_IS_SUPPORTED_REDUCTION(logical_and)
+__PSTL_IS_SUPPORTED_REDUCTION(logical_or)
+__PSTL_IS_SUPPORTED_REDUCTION(bit_and)
+__PSTL_IS_SUPPORTED_REDUCTION(bit_or)
+__PSTL_IS_SUPPORTED_REDUCTION(bit_xor)
+
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 //===----------------------------------------------------------------------===//
@@ -50,49 +71,16 @@ _LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
     _BinaryOperation2 __transform) {
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
-                __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value) {
+                __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value &&
+                __libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
+                __libcpp_is_contiguous_iterator<_ForwardIterator2>::value && is_arithmetic_v<_Tp> &&
+                (__is_trivial_plus_operation<_BinaryOperation1, _Tp, _Tp>::value ||
+                 __is_supported_reduction<_BinaryOperation1, _Tp, _Tp>::value)) {
     return std::__par_backend::__parallel_for_simd_reduction_2(
-        std::move(__first1),
-        std::move(__first2),
-        __last1 - __first1,
-        std::move(__init),
-        std::move(__reduce),
-        [=](__iter_reference<_ForwardIterator1> __in_value_1, __iter_reference<_ForwardIterator1> __in_value_2) {
-          return __transform(__in_value_1, __in_value_2);
-        });
-  } else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
-                       __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
-                       __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value) {
-    return std::__terminate_on_exception([&] {
-      return __par_backend::__parallel_transform_reduce(
-          __first1,
-          std::move(__last1),
-          [__first1, __first2, __transform](_ForwardIterator1 __iter) {
-            return __transform(*__iter, *(__first2 + (__iter - __first1)));
-          },
-          std::move(__init),
-          std::move(__reduce),
-          [__first1, __first2, __reduce, __transform](
-              _ForwardIterator1 __brick_first, _ForwardIterator1 __brick_last, _Tp __brick_init) {
-            return std::__pstl_transform_reduce<__remove_parallel_policy_t<_ExecutionPolicy>>(
-                __cpu_backend_tag{},
-                __brick_first,
-                std::move(__brick_last),
-                __first2 + (__brick_first - __first1),
-                std::move(__brick_init),
-                std::move(__reduce),
-                std::move(__transform));
-          });
-    });
-  } else {
-    return std::transform_reduce(
-        std::move(__first1),
-        std::move(__last1),
-        std::move(__first2),
-        std::move(__init),
-        std::move(__reduce),
-        std::move(__transform));
+        __first1, __first2, __last1 - __first1, __init, __reduce, __transform);
   }
+  return std::__pstl_transform_reduce<_ExecutionPolicy>(
+      __cpu_backend_tag{}, __first1, __last1, __first2, std::move(__init), __reduce, __transform);
 }
 
 //===----------------------------------------------------------------------===//
@@ -108,36 +96,15 @@ _LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
     _BinaryOperation __reduce,
     _UnaryOperation __transform) {
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
-                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
+                __libcpp_is_contiguous_iterator<_ForwardIterator>::value && is_arithmetic_v<_Tp> &&
+                (__is_trivial_plus_operation<_BinaryOperation, _Tp, _Tp>::value ||
+                 __is_supported_reduction<_BinaryOperation, _Tp, _Tp>::value)) {
     return std::__par_backend::__parallel_for_simd_reduction_1(
-        std::move(__first),
-        __last - __first,
-        std::move(__init),
-        std::move(__reduce),
-        [=](__iter_reference<_ForwardIterator> __in_value) { return __transform(__in_value); });
-  } else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
-                       __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
-    return std::__terminate_on_exception([&] {
-      return __par_backend::__parallel_transform_reduce(
-          std::move(__first),
-          std::move(__last),
-          [__transform](_ForwardIterator __iter) { return __transform(*__iter); },
-          std::move(__init),
-          __reduce,
-          [__transform, __reduce](auto __brick_first, auto __brick_last, _Tp __brick_init) {
-            return std::__pstl_transform_reduce<__remove_parallel_policy_t<_ExecutionPolicy>>(
-                __cpu_backend_tag{},
-                std::move(__brick_first),
-                std::move(__brick_last),
-                std::move(__brick_init),
-                std::move(__reduce),
-                std::move(__transform));
-          });
-    });
-  } else {
-    return std::transform_reduce(
-        std::move(__first), std::move(__last), std::move(__init), std::move(__reduce), std::move(__transform));
+        __first, __last - __first, __init, __reduce, __transform);
   }
+  return std::__pstl_transform_reduce<_ExecutionPolicy>(
+      __cpu_backend_tag{}, __first, __last, std::move(__init), __reduce, __transform);
 }
 
 _LIBCPP_END_NAMESPACE_STD

>From 94474817194a47d06a11567dd0e9aeabc7b6da63 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Tue, 26 Sep 2023 20:15:33 -0700
Subject: [PATCH 08/24] Missing return statements in fill and for_each

---
 libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h    | 2 +-
 .../include/__algorithm/pstl_backends/gpu_backends/for_each.h   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
index 8dc6bc6a6179c0e..d109495009df895 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
@@ -36,7 +36,7 @@ __pstl_fill(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
-    std::__par_backend::__parallel_for_simd_val_1(__first, __last - __first, __value);
+    return std::__par_backend::__parallel_for_simd_val_1(__first, __last - __first, __value);
   }
   // Otherwise, we execute for_each on the CPU instead
   return std::__pstl_fill<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __value);
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
index 23c8da27e64ae3b..bab0c87de8f2fc7 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
@@ -35,7 +35,7 @@ __pstl_for_each(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
-    std::__par_backend::__parallel_for_simd_1(__first, __last - __first, __func);
+    return std::__par_backend::__parallel_for_simd_1(__first, __last - __first, __func);
   }
   // Else if the excution policy is parallel, we execute for_each on the CPU instead
   return std::__pstl_for_each<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __func);

>From 96adadf8f7227f6543537056f27f98cb18bbe8ce Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Wed, 27 Sep 2023 10:31:16 -0700
Subject: [PATCH 09/24] Passing all LIT tests

---
 .../__algorithm/pstl_backends/gpu_backends/fill.h | 10 ++++++----
 .../pstl_backends/gpu_backends/for_each.h         |  8 +++++---
 .../pstl_backends/gpu_backends/stable_sort.h      |  2 +-
 .../pstl_backends/gpu_backends/transform.h        | 15 ++++++++-------
 .../pstl_backends/gpu_backends/transform_reduce.h |  6 +++---
 5 files changed, 23 insertions(+), 18 deletions(-)

diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
index d109495009df895..f32ee8b016b3eae 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
@@ -30,16 +30,18 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 template <class _ExecutionPolicy, class _ForwardIterator, class _Tp>
 _LIBCPP_HIDE_FROM_ABI void
 __pstl_fill(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) {
-  // It is only safe to execute for_each on the GPU, it the execution policy is
+  // It is only safe to execute fill on the GPU, it the execution policy is
   // parallel unsequenced, as it is the only execution policy prohibiting throwing
   // exceptions and allowing SIMD instructions
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
-    return std::__par_backend::__parallel_for_simd_val_1(__first, __last - __first, __value);
+    std::__par_backend::__parallel_for_simd_val_1(__first, __last - __first, __value);
+  }
+  // Otherwise, we execute fill on the CPU instead
+  else {
+    std::__pstl_fill<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __value);
   }
-  // Otherwise, we execute for_each on the CPU instead
-  return std::__pstl_fill<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __value);
 }
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
index bab0c87de8f2fc7..f96b30b5ba25b24 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
@@ -35,10 +35,12 @@ __pstl_for_each(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
-    return std::__par_backend::__parallel_for_simd_1(__first, __last - __first, __func);
+    std::__par_backend::__parallel_for_simd_1(__first, __last - __first, __func);
+  }
+  // Else we fall back to the GPU backend
+  else {
+    std::__pstl_for_each<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __func);
   }
-  // Else if the excution policy is parallel, we execute for_each on the CPU instead
-  return std::__pstl_for_each<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __func);
 }
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/stable_sort.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/stable_sort.h
index 1760a9fd9fc9d32..5cd7081ef73e9c1 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/stable_sort.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/stable_sort.h
@@ -28,7 +28,7 @@ template <class _ExecutionPolicy, class _RandomAccessIterator, class _Comp>
 _LIBCPP_HIDE_FROM_ABI void
 __pstl_stable_sort(__gpu_backend_tag, _RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp) {
   // TODO: Implement GPU backend.
-  return __pstl_stable_sort<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __comp);
+  __pstl_stable_sort<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __comp);
 }
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
index 10f6e5ff174d675..c2e43cb6d643375 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
@@ -38,11 +38,12 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
                 __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value &&
-                __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
-    // While the CPU backend captures by reference, [&], that is not valid when
-    // offloading to the GPU. Therefore we must capture by value, [=].
-    return std::__par_backend::__parallel_for_simd_2(__first, __last - __first, __result, __op);
+                __libcpp_is_contiguous_iterator<_ForwardIterator>::value &&
+                __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
+    std::__par_backend::__parallel_for_simd_2(__first, __last - __first, __result, __op);
+    return __result + (__last - __first);
   }
+  // If it is not safe to offload to the GPU, we rely on the CPU backend.
   return std::__pstl_transform<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __result, __op);
 }
 
@@ -66,10 +67,10 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
                 __libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator2>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
-    // While the CPU backend captures by reference, [&], that is not valid when
-    // offloading to the GPU. Therefore we must capture by value, [=].
-    return std::__par_backend::__parallel_for_simd_3(__first1, __last1 - __first1, __first2, __result, __op);
+    std::__par_backend::__parallel_for_simd_3(__first1, __last1 - __first1, __first2, __result, __op);
+    return __result + (__last1 - __first1);
   }
+  // If it is not safe to offload to the GPU, we rely on the CPU backend.
   return std::__pstl_transform<_ExecutionPolicy>(__cpu_backend_tag{}, __first1, __last1, __first2, __result, __op);
 }
 
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h
index 8590dd3d024ea69..332bb8abc1b8e0b 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h
@@ -31,13 +31,13 @@
 #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
 
 template <class _T1, class _T2, class _T3>
-struct __is_supported_reduction : std::false_type {};
+_LIBCPP_HIDE_FROM_ABI struct __is_supported_reduction : std::false_type {};
 
 #  define __PSTL_IS_SUPPORTED_REDUCTION(funname)                                                                       \
     template <class _Tp>                                                                                               \
-    struct __is_supported_reduction<std::funname<_Tp>, _Tp, _Tp> : std::true_type {};                                  \
+    _LIBCPP_HIDE_FROM_ABI struct __is_supported_reduction<std::funname<_Tp>, _Tp, _Tp> : std::true_type {};            \
     template <class _Tp, class _Up>                                                                                    \
-    struct __is_supported_reduction<std::funname<>, _Tp, _Up> : std::true_type {};
+    _LIBCPP_HIDE_FROM_ABI struct __is_supported_reduction<std::funname<>, _Tp, _Up> : std::true_type {};
 
 // __is_trivial_plus_operation already exists
 __PSTL_IS_SUPPORTED_REDUCTION(plus)

>From ec5c6cb23d06996b23928bd924a736f8b332c838 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Tue, 3 Oct 2023 20:58:11 -0700
Subject: [PATCH 10/24] Handling value iterators in PSTL GPU backend for C++ 20

---
 .../pstl_backends/gpu_backends/omp_offload.h  | 219 ++++++++++--------
 .../pstl_backends/gpu_backends/transform.h    |   6 +
 .../gpu_backends/transform_reduce.h           |  24 +-
 3 files changed, 138 insertions(+), 111 deletions(-)

diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
index 36acafd448ec003..81ec1bbc63d0083 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
@@ -12,6 +12,7 @@
 #include <__assert>
 #include <__config>
 #include <__functional/operations.h>
+#include <__iterator/iterator_traits.h>
 #include <__iterator/wrap_iter.h>
 #include <__memory/addressof.h>
 #include <__memory/pointer_traits.h>
@@ -38,36 +39,66 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 namespace __par_backend {
 inline namespace __omp_gpu_backend {
 
-// Checking if a pointer is in a range
-template <typename T1, typename T2, typename T3>
-_LIBCPP_HIDE_FROM_ABI inline bool __omp_in_ptr_range(T1, T2, T3) {
-  return false;
+// Functions for eaxtracting the pase pointers
+
+// In the general case we do not need to extract it. This is for instance the
+// case for pointers.
+template <typename _Tp>
+_LIBCPP_HIDE_FROM_ABI inline auto __omp_extract_base_ptr(_Tp p) noexcept {
+  return p;
 }
 
+// For vectors and arrays, etc, we need to extract the underlying base pointer.
 template <typename _Tp>
-_LIBCPP_HIDE_FROM_ABI inline bool __omp_in_ptr_range(_Tp* a, _Tp* p, _Tp* b) {
-  return std::less_equal<_Tp*>{}(a, p) && std::less<_Tp*>{}(p, b);
+_LIBCPP_HIDE_FROM_ABI inline _Tp __omp_extract_base_ptr(std::__wrap_iter<_Tp> w) noexcept {
+  std::pointer_traits<std::__wrap_iter<_Tp>> PT;
+  return PT.to_address(w);
 }
 
-// In OpenMP, we need to extract the pointer for the underlying data for data
-// structures like std::vector and std::array to be able to map the data to the
-// device.
+//===----------------------------------------------------------------------===//
+// The following four functions differentiates between contiguous iterators and
+// non-contiguous iterators. That allows to use the same implementations for
+// reference and value iterators
+//===----------------------------------------------------------------------===//
 
-template <typename _Tp, std::enable_if<std::is_pointer<_Tp>::value >::type* = 0>
-_LIBCPP_HIDE_FROM_ABI inline _Tp __omp_extract_base_ptr(_Tp p) {
-  return p;
+template <class _Iterator, class _DifferenceType>
+_LIBCPP_HIDE_FROM_ABI void
+__omp_map_to([[maybe_unused]] const _Iterator p, [[maybe_unused]] const _DifferenceType len) noexcept {
+  if constexpr (__libcpp_is_contiguous_iterator<_Iterator>::value) {
+#  pragma omp target enter data map(to : p[0 : len])
+  } else {
+#  pragma omp target enter data map(to : p)
+  }
 }
 
-template <typename _Tp>
-_LIBCPP_HIDE_FROM_ABI inline auto __omp_extract_base_ptr(_Tp p) {
-  return std::addressof(*p);
-  ;
+template <class _Iterator, class _DifferenceType>
+_LIBCPP_HIDE_FROM_ABI void
+__omp_map_from([[maybe_unused]] const _Iterator p, [[maybe_unused]] const _DifferenceType len) noexcept {
+  if constexpr (__libcpp_is_contiguous_iterator<_Iterator>::value) {
+#  pragma omp target exit data map(from : p[0 : len])
+  } else {
+#  pragma omp target exit data map(release : p)
+  }
 }
 
-template <typename _Tp>
-_LIBCPP_HIDE_FROM_ABI inline _Tp __omp_extract_base_ptr(std::__wrap_iter<_Tp> w) {
-  std::pointer_traits<std::__wrap_iter<_Tp>> PT;
-  return PT.to_address(w);
+template <class _Iterator, class _DifferenceType>
+_LIBCPP_HIDE_FROM_ABI void
+__omp_map_alloc([[maybe_unused]] const _Iterator p, [[maybe_unused]] const _DifferenceType len) noexcept {
+  if constexpr (__libcpp_is_contiguous_iterator<_Iterator>::value) {
+#  pragma omp target enter data map(alloc : p[0 : len])
+  } else {
+#  pragma omp target enter data map(to : p)
+  }
+}
+
+template <class _Iterator, class _DifferenceType>
+_LIBCPP_HIDE_FROM_ABI void
+__omp_map_free([[maybe_unused]] const _Iterator p, [[maybe_unused]] const _DifferenceType len) noexcept {
+  if constexpr (__libcpp_is_contiguous_iterator<_Iterator>::value) {
+#  pragma omp target exit data map(release : p[0 : len])
+  } else {
+#  pragma omp target exit data map(release : p)
+  }
 }
 
 //===----------------------------------------------------------------------===//
@@ -79,9 +110,11 @@ _LIBCPP_HIDE_FROM_ABI inline _Tp __omp_extract_base_ptr(std::__wrap_iter<_Tp> w)
 template <class _Iterator, class _DifferenceType, class _Function>
 _LIBCPP_HIDE_FROM_ABI _Iterator __omp_parallel_for_simd_1(
     _Iterator __first, _DifferenceType __n, _Function __f, [[maybe_unused]] const int __device = 0) noexcept {
-#  pragma omp target teams distribute parallel for simd map(tofrom : __first[0 : __n]) device(__device)
+  __omp_map_to(__first, __n);
+#  pragma omp target teams distribute parallel for simd device(__device)
   for (_DifferenceType __i = 0; __i < __n; ++__i)
     __f(__first[__i]);
+  __omp_map_from(__first, __n);
 
   return __first + __n;
 }
@@ -99,11 +132,11 @@ _LIBCPP_HIDE_FROM_ABI _Iterator __parallel_for_simd_1(_Iterator __first, _Differ
 template <class _Index, class _DifferenceType, class _Tp>
 _LIBCPP_HIDE_FROM_ABI _Index __omp_parallel_for_simd_val_1(
     _Index __first, _DifferenceType __n, const _Tp& __value, [[maybe_unused]] const int __device = 0) noexcept {
-#  pragma omp target teams distribute parallel for simd map(from : __first[0 : __n]) map(always, to : __value)         \
-      device(__device)
+  __omp_map_alloc(__first, __n);
+#  pragma omp target teams distribute parallel for simd firstprivate(__value) device(__device)
   for (_DifferenceType __i = 0; __i < __n; ++__i)
     __first[__i] = __value;
-
+  __omp_map_from(__first, __n);
   return __first + __n;
 }
 
@@ -125,20 +158,13 @@ _LIBCPP_HIDE_FROM_ABI _Iterator1 __omp_parallel_for_simd_2(
     _Iterator2 __first2,
     _Function __f,
     [[maybe_unused]] const int __device = 0) noexcept {
-  if ((!std::is_same<_Iterator1, _Iterator2>::value) ||
-      (std::is_same<_Iterator1, _Iterator2>::value &&
-       !__omp_gpu_backend::__omp_in_ptr_range(__first1, __first2, __first1 + __n))) {
-#  pragma omp target teams distribute parallel for simd map(to : __first1[0 : __n]) map(from : __first2[0 : __n])      \
-      device(__device)
-    for (_DifferenceType __i = 0; __i < __n; ++__i)
-      __first2[__i] = __f(__first1[__i]);
-    return __first1 + __n;
-  }
-#  pragma omp target teams distribute parallel for simd map(tofrom : __first1[0 : __n], __first2[0 : __n])             \
-      device(__device)
+  __omp_map_alloc(__first2, __n);
+  __omp_map_to(__first1, __n);
+#  pragma omp target teams distribute parallel for simd device(__device)
   for (_DifferenceType __i = 0; __i < __n; ++__i)
-    __first2[__i] = __f(__first1[__i]);
-
+    *(__first2 + __i) = __f(*(__first1 + __i));
+  __omp_map_from(__first2, __n);
+  __omp_map_free(__first1, __n);
   return __first1 + __n;
 }
 
@@ -167,33 +193,15 @@ _LIBCPP_HIDE_FROM_ABI _Iterator1 __omp_parallel_for_simd_3(
     _Iterator3 __first3,
     _Function __f,
     [[maybe_unused]] const int __device = 0) noexcept {
-  // It may be that __first3 is in the interval [__first1+__n) or [__firt2+__n)
-  // It is, however, undefined behavior to compare two pointers that do not
-  // point to the same object or are not the same type.
-  // If we can prove that __first3 is not in any of the ranges [__first1+__n)
-  // or [__firt2+__n), it is safe to reduce the amount of data copied to and
-  // from the device
-  constexpr bool are_not_same_type =
-      !std::is_same<_Iterator1, _Iterator2>::value && !std::is_same<_Iterator1, _Iterator3>::value;
-  const bool no_overlap_13 =
-      std::is_same<_Iterator1, _Iterator3>::value &&
-      !__omp_gpu_backend::__omp_in_ptr_range(__first1, __first3, __first1 + __n);
-  const bool no_overlap_23 =
-      std::is_same<_Iterator2, _Iterator3>::value &&
-      !__omp_gpu_backend::__omp_in_ptr_range(__first2, __first3, __first2 + __n);
-  if (are_not_same_type || (no_overlap_13 && no_overlap_23)) {
-#  pragma omp target teams distribute parallel for simd map(to : __first1[0 : __n], __first2[0 : __n])                 \
-      map(from : __first3[0 : __n]) device(__device)
-    for (_DifferenceType __i = 0; __i < __n; ++__i)
-      __first3[__i] = __f(__first1[__i], __first2[__i]);
-    return __first1 + __n;
-  }
-  // In the general case, we have to map all data to and from the device
-#  pragma omp target teams distribute parallel for simd map(                                                           \
-          tofrom : __first1[0 : __n], __first2[0 : __n], __first3[0 : __n]) device(__device)
+  __omp_map_to(__first1, __n);
+  __omp_map_to(__first2, __n);
+  __omp_map_alloc(__first3, __n);
+#  pragma omp target teams distribute parallel for simd device(__device)
   for (_DifferenceType __i = 0; __i < __n; ++__i)
-    __first3[__i] = __f(__first1[__i], __first2[__i]);
-
+    *(__first3 + __i) = __f(*(__first1 + __i), *(__first2 + __i));
+  __omp_map_free(__first1, __n);
+  __omp_map_free(__first2, __n);
+  __omp_map_from(__first3, __n);
   return __first1 + __n;
 }
 
@@ -215,47 +223,54 @@ _LIBCPP_HIDE_FROM_ABI _Iterator1 __parallel_for_simd_3(
 // Templates for reductions
 //===----------------------------------------------------------------------===//
 
-// General case
-
-#  define __PSTL_OMP_SIMD_1_REDUCTION(omp_op, std_op)                                                                                 \
-    template <class _Iterator,                                                                                                   \
-              class _DifferenceType,                                                                                             \
-              typename _Tp,                                                                                                      \
-              typename _BinaryOperationType,                                                                                     \
-              typename _UnaryOperation>                                                                     \
-    _LIBCPP_HIDE_FROM_ABI _Tp __omp_parallel_for_simd_reduction_1(                                                               \
-        _Iterator __first,                                                                                                       \
-        _DifferenceType __n,                                                                                                     \
-        _Tp __init,                                                                                                              \
-        std_op<_BinaryOperationType> __reduce,                                                                                   \
-        _UnaryOperation __transform/*,                                                                                             \
-        [[maybe_unused]] const int __device = 0*/) noexcept {    \
-_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init) map(to : __first[0 : __n])) /*device(__device))*/ \
-      for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                                 \
-        __init = __reduce(__init, __transform(__first[__i]));                                                                         \
-      return __init;                                                                                                                  \
+// In the two following function templates, we map the pointer to the device in
+// different ways depending on if they are contiguou or not.
+
+#  define __PSTL_OMP_SIMD_1_REDUCTION(omp_op, std_op)                                                                  \
+    template <class _Iterator,                                                                                         \
+              class _DifferenceType,                                                                                   \
+              typename _Tp,                                                                                            \
+              typename _BinaryOperationType,                                                                           \
+              typename _UnaryOperation>                                                                                \
+    _LIBCPP_HIDE_FROM_ABI _Tp __omp_parallel_for_simd_reduction_1(                                                     \
+        _Iterator __first,                                                                                             \
+        _DifferenceType __n,                                                                                           \
+        _Tp __init,                                                                                                    \
+        std_op<_BinaryOperationType> __reduce,                                                                         \
+        _UnaryOperation __transform,                                                                                   \
+        [[maybe_unused]] const int __device = 0) noexcept {                                                            \
+      __omp_map_to(__first, __n);                                                                                      \
+_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init) device(__device))                  \
+      for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                  \
+        __init = __reduce(__init, __transform(*(__first + __i)));                                                      \
+      __omp_map_free(__first, __n);                                                                                    \
+      return __init;                                                                                                   \
     }
 
-#  define __PSTL_OMP_SIMD_2_REDUCTION(omp_op, std_op)                                                                                                     \
-    template <class _Iterator1,                                                                                                                      \
-              class _Iterator2,                                                                                                                      \
-              class _DifferenceType,                                                                                                                 \
-              typename _Tp,                                                                                                                          \
-              typename _BinaryOperationType,                                                                                                         \
-              typename _UnaryOperation >                                                                                         \
-    _LIBCPP_HIDE_FROM_ABI _Tp __omp_parallel_for_simd_reduction_2(                                                                                   \
-        _Iterator1 __first1,                                                                                                                         \
-        _Iterator2 __first2,                                                                                                                         \
-        _DifferenceType __n,                                                                                                                         \
-        _Tp __init,                                                                                                                                  \
-        std_op<_BinaryOperationType> __reduce,                                                                                                       \
-        _UnaryOperation __transform/*,                                                                                                                 \
-        [[maybe_unused]] const int __device = 0*/) noexcept {    \
-_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init) map(to : __first1[0 : __n], __first2[0 : __n]))/* device(__device))*/ \
-      for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                                                     \
-        __init = __reduce(__init, __transform(__first1[__i], __first2[__i]));                                                                             \
-      return __init;                                                                                                                                      \
-    }
+#  define __PSTL_OMP_SIMD_2_REDUCTION(omp_op, std_op)                                                                  \
+    template <class _Iterator1,                                                                                        \
+              class _Iterator2,                                                                                        \
+              class _DifferenceType,                                                                                   \
+              typename _Tp,                                                                                            \
+              typename _BinaryOperationType,                                                                           \
+              typename _UnaryOperation >                                                                               \
+    _LIBCPP_HIDE_FROM_ABI _Tp __omp_parallel_for_simd_reduction_2(                                                     \
+        _Iterator1 __first1,                                                                                           \
+        _Iterator2 __first2,                                                                                           \
+        _DifferenceType __n,                                                                                           \
+        _Tp __init,                                                                                                    \
+        std_op<_BinaryOperationType> __reduce,                                                                         \
+        _UnaryOperation __transform,                                                                                   \
+        [[maybe_unused]] const int __device = 0) noexcept {                                                            \
+      __omp_map_to(__first1, __n);                                                                                     \
+      __omp_map_to(__first2, __n);                                                                                     \
+_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init) device(__device))                  \
+      for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                  \
+        __init = __reduce(__init, __transform(*(__first1 + __i), *(__first2 + __i)));                                  \
+      __omp_map_free(__first1, __n);                                                                                   \
+      __omp_map_free(__first2, __n);                                                                                   \
+      return __init;                                                                                                   \
+    } // namespace __omp_gpu_backend
 
 #  define __PSTL_OMP_SIMD_REDUCTION(omp_op, std_op)                                                                    \
     __PSTL_OMP_SIMD_1_REDUCTION(omp_op, std_op)                                                                        \
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
index c2e43cb6d643375..3af2106eb2bda21 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
@@ -35,10 +35,13 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
     _ForwardIterator __last,
     _ForwardOutIterator __result,
     _UnaryOperation __op) {
+  // The interface for the function switched between C++17 and C++20
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
                 __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value &&
+#  if _LIBCPP_STD_VER <= 17
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value &&
+#  endif
                 __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
     std::__par_backend::__parallel_for_simd_2(__first, __last - __first, __result, __op);
     return __result + (__last - __first);
@@ -60,12 +63,15 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
     _ForwardIterator2 __first2,
     _ForwardOutIterator __result,
     _BinaryOperation __op) {
+  // The interface for the function switched between C++17 and C++20
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value &&
                 __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value &&
+#  if _LIBCPP_STD_VER <= 17
                 __libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator2>::value &&
+#  endif
                 __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
     std::__par_backend::__parallel_for_simd_3(__first1, __last1 - __first1, __first2, __result, __op);
     return __result + (__last1 - __first1);
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h
index 332bb8abc1b8e0b..eeffe62c040f083 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h
@@ -30,14 +30,16 @@
 
 #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
 
+_LIBCPP_BEGIN_NAMESPACE_STD
+
 template <class _T1, class _T2, class _T3>
-_LIBCPP_HIDE_FROM_ABI struct __is_supported_reduction : std::false_type {};
+struct _LIBCPP_HIDE_FROM_ABI __is_supported_reduction : std::false_type {};
 
 #  define __PSTL_IS_SUPPORTED_REDUCTION(funname)                                                                       \
     template <class _Tp>                                                                                               \
-    _LIBCPP_HIDE_FROM_ABI struct __is_supported_reduction<std::funname<_Tp>, _Tp, _Tp> : std::true_type {};            \
+    struct _LIBCPP_HIDE_FROM_ABI __is_supported_reduction<std::funname<_Tp>, _Tp, _Tp> : std::true_type {};            \
     template <class _Tp, class _Up>                                                                                    \
-    _LIBCPP_HIDE_FROM_ABI struct __is_supported_reduction<std::funname<>, _Tp, _Up> : std::true_type {};
+    struct _LIBCPP_HIDE_FROM_ABI __is_supported_reduction<std::funname<>, _Tp, _Up> : std::true_type {};
 
 // __is_trivial_plus_operation already exists
 __PSTL_IS_SUPPORTED_REDUCTION(plus)
@@ -49,8 +51,6 @@ __PSTL_IS_SUPPORTED_REDUCTION(bit_and)
 __PSTL_IS_SUPPORTED_REDUCTION(bit_or)
 __PSTL_IS_SUPPORTED_REDUCTION(bit_xor)
 
-_LIBCPP_BEGIN_NAMESPACE_STD
-
 //===----------------------------------------------------------------------===//
 // Two input iterators
 //===----------------------------------------------------------------------===//
@@ -69,11 +69,14 @@ _LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
     _Tp __init,
     _BinaryOperation1 __reduce,
     _BinaryOperation2 __transform) {
+  // The interface for the function switched between C++17 and C++20
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
-                __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value &&
+                __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value && is_arithmetic_v<_Tp> &&
+#  if _LIBCPP_STD_VER <= 17
                 __libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
-                __libcpp_is_contiguous_iterator<_ForwardIterator2>::value && is_arithmetic_v<_Tp> &&
+                __libcpp_is_contiguous_iterator<_ForwardIterator2>::value &&
+#  endif
                 (__is_trivial_plus_operation<_BinaryOperation1, _Tp, _Tp>::value ||
                  __is_supported_reduction<_BinaryOperation1, _Tp, _Tp>::value)) {
     return std::__par_backend::__parallel_for_simd_reduction_2(
@@ -95,9 +98,12 @@ _LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
     _Tp __init,
     _BinaryOperation __reduce,
     _UnaryOperation __transform) {
+  // The interface for the function switched between C++17 and C++20
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
-                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
-                __libcpp_is_contiguous_iterator<_ForwardIterator>::value && is_arithmetic_v<_Tp> &&
+                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value && is_arithmetic_v<_Tp> &&
+#  if _LIBCPP_STD_VER <= 17
+                __libcpp_is_contiguous_iterator<_ForwardIterator>::value &&
+#  endif
                 (__is_trivial_plus_operation<_BinaryOperation, _Tp, _Tp>::value ||
                  __is_supported_reduction<_BinaryOperation, _Tp, _Tp>::value)) {
     return std::__par_backend::__parallel_for_simd_reduction_1(

>From 7971d8126a40ad74acef3ee91105e7d4d05ac9bf Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Fri, 6 Oct 2023 11:42:25 -0700
Subject: [PATCH 11/24] Restructuring pstl_backends/gpu/ to
 pstl_backends/openmp

---
 libcxx/CMakeLists.txt                         | 36 ++++++++-----------
 libcxx/include/CMakeLists.txt                 | 22 ++++++------
 libcxx/include/__algorithm/pstl_backend.h     | 34 +++++++++++++-----
 .../__algorithm/pstl_backends/cpu_backend.h   | 23 +++++++-----
 .../pstl_backends/cpu_backends/backend.h      |  2 ++
 .../__algorithm/pstl_backends/gpu_backend.h   | 27 --------------
 .../__algorithm/pstl_backends/openmp.h        | 27 ++++++++++++++
 .../{gpu_backends => openmp}/any_of.h         | 13 ++++---
 .../{gpu_backends => openmp}/backend.h        | 16 +++------
 .../{gpu_backends => openmp}/fill.h           | 13 ++++---
 .../{gpu_backends => openmp}/find_if.h        | 13 ++++---
 .../{gpu_backends => openmp}/for_each.h       | 19 +++++-----
 .../{gpu_backends => openmp}/merge.h          | 14 ++++----
 .../{gpu_backends => openmp}/omp_offload.h    |  6 ++--
 .../{gpu_backends => openmp}/stable_sort.h    | 13 ++++---
 .../{gpu_backends => openmp}/transform.h      | 25 ++++++-------
 .../transform_reduce.h                        | 23 +++++-------
 libcxx/include/__config_site.in               |  3 +-
 18 files changed, 161 insertions(+), 168 deletions(-)
 delete mode 100644 libcxx/include/__algorithm/pstl_backends/gpu_backend.h
 create mode 100644 libcxx/include/__algorithm/pstl_backends/openmp.h
 rename libcxx/include/__algorithm/pstl_backends/{gpu_backends => openmp}/any_of.h (70%)
 rename libcxx/include/__algorithm/pstl_backends/{gpu_backends => openmp}/backend.h (58%)
 rename libcxx/include/__algorithm/pstl_backends/{gpu_backends => openmp}/fill.h (78%)
 rename libcxx/include/__algorithm/pstl_backends/{gpu_backends => openmp}/find_if.h (71%)
 rename libcxx/include/__algorithm/pstl_backends/{gpu_backends => openmp}/for_each.h (74%)
 rename libcxx/include/__algorithm/pstl_backends/{gpu_backends => openmp}/merge.h (73%)
 rename libcxx/include/__algorithm/pstl_backends/{gpu_backends => openmp}/omp_offload.h (98%)
 rename libcxx/include/__algorithm/pstl_backends/{gpu_backends => openmp}/stable_sort.h (68%)
 rename libcxx/include/__algorithm/pstl_backends/{gpu_backends => openmp}/transform.h (76%)
 rename libcxx/include/__algorithm/pstl_backends/{gpu_backends => openmp}/transform_reduce.h (85%)

diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt
index 16c3bc0a3976918..1272f13df5293fa 100644
--- a/libcxx/CMakeLists.txt
+++ b/libcxx/CMakeLists.txt
@@ -290,19 +290,14 @@ option(LIBCXX_HAS_WIN32_THREAD_API "Ignore auto-detection and force use of win32
 option(LIBCXX_HAS_EXTERNAL_THREAD_API
   "Build libc++ with an externalized threading API.
    This option may only be set to ON when LIBCXX_ENABLE_THREADS=ON." OFF)
-option(LIBCXX_ENABLE_GPU_OFFLOAD 
-  "Build libc++ with support for GPU offload" OFF)
 
-if (LIBCXX_ENABLE_THREADS)
-  set(LIBCXX_PSTL_CPU_BACKEND "std_thread" CACHE STRING "Which PSTL CPU backend to use")
-else()
-  set(LIBCXX_PSTL_CPU_BACKEND "serial" CACHE STRING "Which PSTL CPU backend to use")
-endif()
-
-if (${LIBCXX_ENABLE_GPU_OFFLOAD})
-  set(LIBCXX_PSTL_GPU_BACKEND "omp_offload" CACHE STRING "Which PSTL GPU backend to use")
-else()
-  set(LIBCXX_PSTL_GPU_BACKEND "none" CACHE STRING "Which PSTL GPU backend to use")
+set(LIBCXX_PSTL_BACKEND "openmp" CACHE INTERNAL "Which PSTL backend to use")
+if (LIBCXX_PSTL_BACKEND STREQUAL "")
+  if (LIBCXX_ENABLE_THREADS)
+    set(LIBCXX_PSTL_BACKEND "std-thread")
+  else()
+    set(LIBCXX_PSTL_BACKEND "serial")
+  endif()
 endif()
 
 # Misc options ----------------------------------------------------------------
@@ -804,20 +799,17 @@ if (LIBCXX_ENABLE_ASSERTIONS)
   message(FATAL_ERROR "LIBCXX_ENABLE_ASSERTIONS has been replaced by LIBCXX_HARDENING_MODE=safe")
 endif()
 
-if (LIBCXX_PSTL_CPU_BACKEND STREQUAL "serial")
+if (LIBCXX_PSTL_BACKEND STREQUAL "serial")
   config_define(1 _LIBCPP_PSTL_CPU_BACKEND_SERIAL)
-elseif(LIBCXX_PSTL_CPU_BACKEND STREQUAL "std_thread")
+elseif (LIBCXX_PSTL_BACKEND STREQUAL "std-thread")
   config_define(1 _LIBCPP_PSTL_CPU_BACKEND_THREAD)
-elseif(LIBCXX_PSTL_CPU_BACKEND STREQUAL "libdispatch")
+elseif (LIBCXX_PSTL_BACKEND STREQUAL "libdispatch")
   config_define(1 _LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH)
+elseif (LIBCXX_PSTL_BACKEND STREQUAL "openmp")
+  config_define(1 _LIBCPP_PSTL_BACKEND_OPENMP)
 else()
-  message(FATAL_ERROR "LIBCXX_PSTL_CPU_BACKEND is set to ${LIBCXX_PSTL_CPU_BACKEND}, which is not a valid backend.
-                       Valid backends are: serial, std_thread and libdispatch")
-endif()
-
-config_define_if(LIBCXX_ENABLE_GPU_OFFLOAD _LIBCPP_PSTL_GPU_OFFLOAD)
-if (LIBCXX_PSTL_GPU_BACKEND STREQUAL "omp_offload")
-  config_define(1 _LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
+  message(FATAL_ERROR "LIBCXX_PSTL_BACKEND is set to ${LIBCXX_PSTL_BACKEND}, which is not a valid backend.
+                       Valid backends are: serial, std-thread, libdispatch, and openmp.")
 endif()
 
 if (LIBCXX_ABI_DEFINES)
diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index 8dfd4e2f26ecf8f..ab38408c1715627 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -85,17 +85,17 @@ set(files
   __algorithm/pstl_backends/cpu_backends/thread.h
   __algorithm/pstl_backends/cpu_backends/transform.h
   __algorithm/pstl_backends/cpu_backends/transform_reduce.h
-  __algorithm/pstl_backends/gpu_backend.h
-  __algorithm/pstl_backends/gpu_backends/any_of.h
-  __algorithm/pstl_backends/gpu_backends/backend.h
-  __algorithm/pstl_backends/gpu_backends/fill.h
-  __algorithm/pstl_backends/gpu_backends/find_if.h
-  __algorithm/pstl_backends/gpu_backends/for_each.h
-  __algorithm/pstl_backends/gpu_backends/merge.h
-  __algorithm/pstl_backends/gpu_backends/omp_offload.h
-  __algorithm/pstl_backends/gpu_backends/stable_sort.h
-  __algorithm/pstl_backends/gpu_backends/transform.h
-  __algorithm/pstl_backends/gpu_backends/transform_reduce.h
+  __algorithm/pstl_backends/openmp.h
+  __algorithm/pstl_backends/openmp/any_of.h
+  __algorithm/pstl_backends/openmp/backend.h
+  __algorithm/pstl_backends/openmp/fill.h
+  __algorithm/pstl_backends/openmp/find_if.h
+  __algorithm/pstl_backends/openmp/for_each.h
+  __algorithm/pstl_backends/openmp/merge.h
+  __algorithm/pstl_backends/openmp/omp_offload.h
+  __algorithm/pstl_backends/openmp/stable_sort.h
+  __algorithm/pstl_backends/openmp/transform.h
+  __algorithm/pstl_backends/openmp/transform_reduce.h
   __algorithm/pstl_copy.h
   __algorithm/pstl_count.h
   __algorithm/pstl_fill.h
diff --git a/libcxx/include/__algorithm/pstl_backend.h b/libcxx/include/__algorithm/pstl_backend.h
index ed04b4f0cf2cc64..80c9db58e983458 100644
--- a/libcxx/include/__algorithm/pstl_backend.h
+++ b/libcxx/include/__algorithm/pstl_backend.h
@@ -10,7 +10,7 @@
 #define _LIBCPP___ALGORITHM_PSTL_BACKEND_H
 
 #include <__algorithm/pstl_backends/cpu_backend.h>
-#include <__algorithm/pstl_backends/gpu_backend.h>
+#include <__algorithm/pstl_backends/openmp.h>
 #include <__config>
 #include <execution>
 
@@ -179,6 +179,9 @@ implemented, all the algorithms will eventually forward to the basis algorithms
 template <class _ExecutionPolicy>
 struct __select_backend;
 
+#  if defined(_LIBCPP_PSTL_CPU_BACKEND_SERIAL) || defined(_LIBCPP_PSTL_CPU_BACKEND_THREAD) ||                          \
+      defined(_LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH)
+
 template <>
 struct __select_backend<std::execution::sequenced_policy> {
   using type = __cpu_backend_tag;
@@ -191,25 +194,40 @@ struct __select_backend<std::execution::unsequenced_policy> {
 };
 #  endif
 
-#  if defined(_LIBCPP_PSTL_CPU_BACKEND_SERIAL) || defined(_LIBCPP_PSTL_CPU_BACKEND_THREAD) ||                          \
-      defined(_LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH)
 template <>
 struct __select_backend<std::execution::parallel_policy> {
   using type = __cpu_backend_tag;
 };
 
-#    if defined(_LIBCPP_PSTL_GPU_OFFLOAD)
 template <>
 struct __select_backend<std::execution::parallel_unsequenced_policy> {
-  using type = __gpu_backend_tag;
+  using type = __cpu_backend_tag;
 };
-#    else
+
+#  elif defined(_LIBCPP_PSTL_BACKEND_OPENMP)
+
 template <>
-struct __select_backend<std::execution::parallel_unsequenced_policy> {
-  using type = __cpu_backend_tag;
+struct __select_backend<std::execution::sequenced_policy> {
+  using type = __omp_backend_tag;
+};
+
+#    if _LIBCPP_STD_VER >= 20
+template <>
+struct __select_backend<std::execution::unsequenced_policy> {
+  using type = __omp_backend_tag;
 };
 #    endif
 
+template <>
+struct __select_backend<std::execution::parallel_policy> {
+  using type = __omp_backend_tag;
+};
+
+template <>
+struct __select_backend<std::execution::parallel_unsequenced_policy> {
+  using type = __omp_backend_tag;
+};
+
 #  else
 
 // ...New vendors can add parallel backends here...
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backend.h b/libcxx/include/__algorithm/pstl_backends/cpu_backend.h
index e54f331b9430b68..301ea391b33b73a 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backend.h
@@ -46,14 +46,19 @@
   TODO: Document the parallel backend
 */
 
-#include <__algorithm/pstl_backends/cpu_backends/any_of.h>
-#include <__algorithm/pstl_backends/cpu_backends/backend.h>
-#include <__algorithm/pstl_backends/cpu_backends/fill.h>
-#include <__algorithm/pstl_backends/cpu_backends/find_if.h>
-#include <__algorithm/pstl_backends/cpu_backends/for_each.h>
-#include <__algorithm/pstl_backends/cpu_backends/merge.h>
-#include <__algorithm/pstl_backends/cpu_backends/stable_sort.h>
-#include <__algorithm/pstl_backends/cpu_backends/transform.h>
-#include <__algorithm/pstl_backends/cpu_backends/transform_reduce.h>
+#if defined(_LIBCPP_PSTL_CPU_BACKEND_SERIAL) || defined(_LIBCPP_PSTL_CPU_BACKEND_THREAD) ||                            \
+    defined(_LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH)
+
+#  include <__algorithm/pstl_backends/cpu_backends/any_of.h>
+#  include <__algorithm/pstl_backends/cpu_backends/backend.h>
+#  include <__algorithm/pstl_backends/cpu_backends/fill.h>
+#  include <__algorithm/pstl_backends/cpu_backends/find_if.h>
+#  include <__algorithm/pstl_backends/cpu_backends/for_each.h>
+#  include <__algorithm/pstl_backends/cpu_backends/merge.h>
+#  include <__algorithm/pstl_backends/cpu_backends/stable_sort.h>
+#  include <__algorithm/pstl_backends/cpu_backends/transform.h>
+#  include <__algorithm/pstl_backends/cpu_backends/transform_reduce.h>
+
+#endif
 
 #endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKEND_H
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/backend.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/backend.h
index ea2210a4a7adbdb..b8e9b1e28201d16 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/backend.h
@@ -18,6 +18,8 @@
 #  include <__algorithm/pstl_backends/cpu_backends/thread.h>
 #elif defined(_LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH)
 #  include <__algorithm/pstl_backends/cpu_backends/libdispatch.h>
+#elif defined(_LIBCPP_PSTL_BACKEND_OPENMP)
+// OpenMP covers both CPU and GPU backends
 #else
 #  error "Invalid CPU backend choice"
 #endif
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
deleted file mode 100644
index f41332fbf9f6d42..000000000000000
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
+++ /dev/null
@@ -1,27 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
-#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
-
-#include <__config>
-
-#include <__algorithm/pstl_backends/gpu_backends/backend.h>
-
-#if defined(_LIBCPP_PSTL_GPU_OFFLOAD)
-#  include <__algorithm/pstl_backends/gpu_backends/any_of.h>
-#  include <__algorithm/pstl_backends/gpu_backends/fill.h>
-#  include <__algorithm/pstl_backends/gpu_backends/find_if.h>
-#  include <__algorithm/pstl_backends/gpu_backends/for_each.h>
-#  include <__algorithm/pstl_backends/gpu_backends/merge.h>
-#  include <__algorithm/pstl_backends/gpu_backends/stable_sort.h>
-#  include <__algorithm/pstl_backends/gpu_backends/transform.h>
-#  include <__algorithm/pstl_backends/gpu_backends/transform_reduce.h>
-#endif
-
-#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp.h b/libcxx/include/__algorithm/pstl_backends/openmp.h
new file mode 100644
index 000000000000000..7787c82ff98825a
--- /dev/null
+++ b/libcxx/include/__algorithm/pstl_backends/openmp.h
@@ -0,0 +1,27 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
+
+#include <__config>
+
+#include <__algorithm/pstl_backends/openmp/backend.h>
+
+#if defined(_LIBCPP_PSTL_BACKEND_OPENMP)
+#  include <__algorithm/pstl_backends/openmp/any_of.h>
+#  include <__algorithm/pstl_backends/openmp/fill.h>
+#  include <__algorithm/pstl_backends/openmp/find_if.h>
+#  include <__algorithm/pstl_backends/openmp/for_each.h>
+#  include <__algorithm/pstl_backends/openmp/merge.h>
+#  include <__algorithm/pstl_backends/openmp/stable_sort.h>
+#  include <__algorithm/pstl_backends/openmp/transform.h>
+#  include <__algorithm/pstl_backends/openmp/transform_reduce.h>
+#endif
+
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/any_of.h b/libcxx/include/__algorithm/pstl_backends/openmp/any_of.h
similarity index 70%
rename from libcxx/include/__algorithm/pstl_backends/gpu_backends/any_of.h
rename to libcxx/include/__algorithm/pstl_backends/openmp/any_of.h
index 8d911de55dcd685..2b9a88fc58edcdd 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/any_of.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/any_of.h
@@ -6,13 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_ANY_OF_H
-#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_ANY_OF_H
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_ANY_OF_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_ANY_OF_H
 
 #include <__algorithm/any_of.h>
 #include <__algorithm/find_if.h>
-#include <__algorithm/pstl_backends/cpu_backends/backend.h>
-#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__atomic/atomic.h>
 #include <__atomic/memory_order.h>
 #include <__config>
@@ -29,13 +28,13 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <class _ExecutionPolicy, class _ForwardIterator, class _Predicate>
 _LIBCPP_HIDE_FROM_ABI bool
-__pstl_any_of(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) {
+__pstl_any_of(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) {
   // TODO: Implement GPU backend
-  return std::__pstl_any_of<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __pred);
+  return std::any_of(__first, __last, __pred);
 }
 
 _LIBCPP_END_NAMESPACE_STD
 
 #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
 
-#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_ANY_OF_H
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_ANY_OF_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/backend.h b/libcxx/include/__algorithm/pstl_backends/openmp/backend.h
similarity index 58%
rename from libcxx/include/__algorithm/pstl_backends/gpu_backends/backend.h
rename to libcxx/include/__algorithm/pstl_backends/openmp/backend.h
index a03ad35d8d2ae3e..e4e6136082a3427 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/backend.h
@@ -6,19 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_BACKEND_H
-#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_BACKEND_H
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_BACKEND_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_BACKEND_H
 
 #include <__config>
 #include <cstddef>
 
-#if defined(_LIBCPP_PSTL_GPU_OFFLOAD)
-#  if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
-#    include <__algorithm/pstl_backends/gpu_backends/omp_offload.h>
-#  else
-#    error Invalid PSTL GPU backend
-#  endif
-#endif
+#include <__algorithm/pstl_backends/openmp/omp_offload.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -28,10 +22,10 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
-struct __gpu_backend_tag {};
+struct __omp_backend_tag {};
 
 _LIBCPP_END_NAMESPACE_STD
 
 #endif // _LIBCPP_STD_VER >= 17
 
-#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_BACKEND_H
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_BACKEND_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
similarity index 78%
rename from libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
rename to libcxx/include/__algorithm/pstl_backends/openmp/fill.h
index f32ee8b016b3eae..56341b04f0e50df 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
@@ -6,12 +6,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FILL_H
-#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FILL_H
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_FILL_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_FILL_H
 
 #include <__algorithm/fill.h>
-#include <__algorithm/pstl_backends/cpu_backends/backend.h>
-#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
 #include <__iterator/concepts.h>
 #include <__iterator/iterator_traits.h>
@@ -29,7 +28,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <class _ExecutionPolicy, class _ForwardIterator, class _Tp>
 _LIBCPP_HIDE_FROM_ABI void
-__pstl_fill(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) {
+__pstl_fill(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) {
   // It is only safe to execute fill on the GPU, it the execution policy is
   // parallel unsequenced, as it is the only execution policy prohibiting throwing
   // exceptions and allowing SIMD instructions
@@ -40,7 +39,7 @@ __pstl_fill(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last
   }
   // Otherwise, we execute fill on the CPU instead
   else {
-    std::__pstl_fill<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __value);
+    std::fill(__first, __last, __value);
   }
 }
 
@@ -48,4 +47,4 @@ _LIBCPP_END_NAMESPACE_STD
 
 #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
 
-#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FILL_H
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_FILL_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/find_if.h b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
similarity index 71%
rename from libcxx/include/__algorithm/pstl_backends/gpu_backends/find_if.h
rename to libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
index 2d34938f92dff38..1e78cb67cd75978 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/find_if.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
@@ -6,12 +6,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_FIND_IF_H
-#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_FIND_IF_H
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_FIND_IF_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_FIND_IF_H
 
 #include <__algorithm/find_if.h>
-#include <__algorithm/pstl_backends/cpu_backends/backend.h>
-#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__atomic/atomic.h>
 #include <__config>
 #include <__functional/operations.h>
@@ -32,13 +31,13 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <class _ExecutionPolicy, class _ForwardIterator, class _Predicate>
 _LIBCPP_HIDE_FROM_ABI _ForwardIterator
-__pstl_find_if(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) {
+__pstl_find_if(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) {
   // TODO: Implement the GPU backend
-  return std::__pstl_find_if<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __pred);
+  return std::find_if(__first, __last, __pred);
 }
 
 _LIBCPP_END_NAMESPACE_STD
 
 #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
 
-#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_FIND_IF_H
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_FIND_IF_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
similarity index 74%
rename from libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
rename to libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
index f96b30b5ba25b24..401cdfade50a103 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
@@ -6,12 +6,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FOR_EACH_H
-#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FOR_EACH_H
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_FOR_EACH_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_FOR_EACH_H
 
 #include <__algorithm/for_each.h>
-#include <__algorithm/pstl_backends/cpu_backends/backend.h>
-#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
 #include <__iterator/concepts.h>
 #include <__type_traits/is_execution_policy.h>
@@ -28,7 +27,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <class _ExecutionPolicy, class _ForwardIterator, class _Functor>
 _LIBCPP_HIDE_FROM_ABI void
-__pstl_for_each(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Functor __func) {
+__pstl_for_each(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Functor __func) {
   // It is only safe to execute for_each on the GPU, it the execution policy is
   // parallel unsequenced, as it is the only execution policy prohibiting throwing
   // exceptions and allowing SIMD instructions
@@ -36,10 +35,10 @@ __pstl_for_each(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __
                 __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
     std::__par_backend::__parallel_for_simd_1(__first, __last - __first, __func);
-  }
-  // Else we fall back to the GPU backend
-  else {
-    std::__pstl_for_each<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __func);
+  } else {
+    // If it is not safe to offload to the GPU, we call the serial
+    // implementation
+    std::for_each(__first, __last, __func);
   }
 }
 
@@ -47,4 +46,4 @@ _LIBCPP_END_NAMESPACE_STD
 
 #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
 
-#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FOR_EACH_H
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_FOR_EACH_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/merge.h b/libcxx/include/__algorithm/pstl_backends/openmp/merge.h
similarity index 73%
rename from libcxx/include/__algorithm/pstl_backends/gpu_backends/merge.h
rename to libcxx/include/__algorithm/pstl_backends/openmp/merge.h
index bc947ebb27ac7f0..23036e1cc853980 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/merge.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/merge.h
@@ -6,12 +6,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_MERGE_H
-#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_MERGE_H
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_MERGE_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_MERGE_H
 
 #include <__algorithm/merge.h>
-#include <__algorithm/pstl_backends/cpu_backends/backend.h>
-#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
 #include <__iterator/concepts.h>
 #include <__type_traits/is_execution_policy.h>
@@ -32,7 +31,7 @@ template <class _ExecutionPolicy,
           class _ForwardOutIterator,
           class _Comp>
 _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_merge(
-    __gpu_backend_tag,
+    __omp_backend_tag,
     _ForwardIterator1 __first1,
     _ForwardIterator1 __last1,
     _ForwardIterator2 __first2,
@@ -40,12 +39,11 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_merge(
     _ForwardOutIterator __result,
     _Comp __comp) {
   // TODO: Implement GPU backend
-  return std::__pstl_merge<_ExecutionPolicy>(
-      __cpu_backend_tag{}, __first1, __last1, __first2, __last2, __result, __comp);
+  return std::merge(__first1, __last1, __first2, __last2, __result, __comp);
 }
 
 _LIBCPP_END_NAMESPACE_STD
 
 #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
 
-#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_MERGE_H
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_MERGE_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h b/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
similarity index 98%
rename from libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
rename to libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
index 81ec1bbc63d0083..ee0bfe13bac9010 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
@@ -6,8 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_OMP_OFFLOAD_H
-#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_OMP_OFFLOAD_H
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_OMP_OFFLOAD_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_OMP_OFFLOAD_H
 
 #include <__assert>
 #include <__config>
@@ -346,4 +346,4 @@ _LIBCPP_END_NAMESPACE_STD
 
 _LIBCPP_POP_MACROS
 
-#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_OMP_OFFLOAD_H
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_OMP_OFFLOAD_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/stable_sort.h b/libcxx/include/__algorithm/pstl_backends/openmp/stable_sort.h
similarity index 68%
rename from libcxx/include/__algorithm/pstl_backends/gpu_backends/stable_sort.h
rename to libcxx/include/__algorithm/pstl_backends/openmp/stable_sort.h
index 5cd7081ef73e9c1..7d28b8de77983a7 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/stable_sort.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/stable_sort.h
@@ -6,11 +6,10 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_STABLE_SORT_H
-#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_STABLE_SORT_H
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_STABLE_SORT_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_STABLE_SORT_H
 
-#include <__algorithm/pstl_backends/cpu_backends/backend.h>
-#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__algorithm/stable_sort.h>
 #include <__config>
 #include <__type_traits/is_execution_policy.h>
@@ -26,13 +25,13 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <class _ExecutionPolicy, class _RandomAccessIterator, class _Comp>
 _LIBCPP_HIDE_FROM_ABI void
-__pstl_stable_sort(__gpu_backend_tag, _RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp) {
+__pstl_stable_sort(__omp_backend_tag, _RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp) {
   // TODO: Implement GPU backend.
-  __pstl_stable_sort<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __comp);
+  std::stable_sort(__first, __last, __comp);
 }
 
 _LIBCPP_END_NAMESPACE_STD
 
 #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
 
-#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_STABLE_SORT_H
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_STABLE_SORT_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
similarity index 76%
rename from libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
rename to libcxx/include/__algorithm/pstl_backends/openmp/transform.h
index 3af2106eb2bda21..6dc7ab22c98cc35 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
@@ -6,11 +6,10 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_TRANSFORM_H
-#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_TRANSFORM_H
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_TRANSFORM_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_TRANSFORM_H
 
-#include <__algorithm/pstl_backends/cpu_backends/backend.h>
-#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__algorithm/transform.h>
 #include <__config>
 #include <__iterator/concepts.h>
@@ -30,7 +29,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <class _ExecutionPolicy, class _ForwardIterator, class _ForwardOutIterator, class _UnaryOperation>
 _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
-    __gpu_backend_tag,
+    __omp_backend_tag,
     _ForwardIterator __first,
     _ForwardIterator __last,
     _ForwardOutIterator __result,
@@ -46,8 +45,8 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
     std::__par_backend::__parallel_for_simd_2(__first, __last - __first, __result, __op);
     return __result + (__last - __first);
   }
-  // If it is not safe to offload to the GPU, we rely on the CPU backend.
-  return std::__pstl_transform<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __result, __op);
+  // If it is not safe to offload to the GPU, we rely on the serial backend.
+  return std::transform(__first, __last, __result, __op);
 }
 
 template <class _ExecutionPolicy,
@@ -57,7 +56,7 @@ template <class _ExecutionPolicy,
           class _BinaryOperation,
           enable_if_t<is_execution_policy_v<__remove_cvref_t<_ExecutionPolicy>>, int> = 0>
 _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
-    __gpu_backend_tag,
+    __omp_backend_tag,
     _ForwardIterator1 __first1,
     _ForwardIterator1 __last1,
     _ForwardIterator2 __first2,
@@ -68,20 +67,16 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
                 __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value &&
                 __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value &&
-#  if _LIBCPP_STD_VER <= 17
-                __libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
-                __libcpp_is_contiguous_iterator<_ForwardIterator2>::value &&
-#  endif
                 __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
     std::__par_backend::__parallel_for_simd_3(__first1, __last1 - __first1, __first2, __result, __op);
     return __result + (__last1 - __first1);
   }
-  // If it is not safe to offload to the GPU, we rely on the CPU backend.
-  return std::__pstl_transform<_ExecutionPolicy>(__cpu_backend_tag{}, __first1, __last1, __first2, __result, __op);
+  // If it is not safe to offload to the GPU, we call the serial implementation
+  return std::transform(__first1, __last1, __first2, __result, __op);
 }
 
 _LIBCPP_END_NAMESPACE_STD
 
 #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
 
-#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_TRANSFORM_H
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_TRANSFORM_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
similarity index 85%
rename from libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h
rename to libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
index eeffe62c040f083..c63e456b314c1c8 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
@@ -6,11 +6,10 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_TRANSFORM_REDUCE_H
-#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_TRANSFORM_REDUCE_H
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_TRANSFORM_REDUCE_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_TRANSFORM_REDUCE_H
 
-#include <__algorithm/pstl_backends/cpu_backends/backend.h>
-#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
 #include <__functional/operations.h>
 #include <__iterator/concepts.h>
@@ -62,7 +61,7 @@ template <class _ExecutionPolicy,
           class _BinaryOperation1,
           class _BinaryOperation2>
 _LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
-    __gpu_backend_tag,
+    __omp_backend_tag,
     _ForwardIterator1 __first1,
     _ForwardIterator1 __last1,
     _ForwardIterator2 __first2,
@@ -82,8 +81,7 @@ _LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
     return std::__par_backend::__parallel_for_simd_reduction_2(
         __first1, __first2, __last1 - __first1, __init, __reduce, __transform);
   }
-  return std::__pstl_transform_reduce<_ExecutionPolicy>(
-      __cpu_backend_tag{}, __first1, __last1, __first2, std::move(__init), __reduce, __transform);
+  return std::transform_reduce(__first1, __last1, __first2, std::move(__init), __reduce, __transform);
 }
 
 //===----------------------------------------------------------------------===//
@@ -92,7 +90,7 @@ _LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
 
 template <class _ExecutionPolicy, class _ForwardIterator, class _Tp, class _BinaryOperation, class _UnaryOperation>
 _LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
-    __gpu_backend_tag,
+    __omp_backend_tag,
     _ForwardIterator __first,
     _ForwardIterator __last,
     _Tp __init,
@@ -101,20 +99,17 @@ _LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
   // The interface for the function switched between C++17 and C++20
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator>::value && is_arithmetic_v<_Tp> &&
-#  if _LIBCPP_STD_VER <= 17
-                __libcpp_is_contiguous_iterator<_ForwardIterator>::value &&
-#  endif
                 (__is_trivial_plus_operation<_BinaryOperation, _Tp, _Tp>::value ||
                  __is_supported_reduction<_BinaryOperation, _Tp, _Tp>::value)) {
     return std::__par_backend::__parallel_for_simd_reduction_1(
         __first, __last - __first, __init, __reduce, __transform);
   }
-  return std::__pstl_transform_reduce<_ExecutionPolicy>(
-      __cpu_backend_tag{}, __first, __last, std::move(__init), __reduce, __transform);
+  // If it is not safe to offload to the GPU, we call the serial implementation
+  return std::transform_reduce(__first, __last, std::move(__init), __reduce, __transform);
 }
 
 _LIBCPP_END_NAMESPACE_STD
 
 #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
 
-#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_TRANSFORM_REDUCE_H
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_TRANSFORM_REDUCE_H
diff --git a/libcxx/include/__config_site.in b/libcxx/include/__config_site.in
index e7fb4f423079333..21334e797b89632 100644
--- a/libcxx/include/__config_site.in
+++ b/libcxx/include/__config_site.in
@@ -34,8 +34,7 @@
 #cmakedefine _LIBCPP_PSTL_CPU_BACKEND_SERIAL
 #cmakedefine _LIBCPP_PSTL_CPU_BACKEND_THREAD
 #cmakedefine _LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH
-#cmakedefine _LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD
-#cmakedefine _LIBCPP_PSTL_GPU_OFFLOAD
+#cmakedefine _LIBCPP_PSTL_BACKEND_OPENMP
 
 // Hardening.
 #cmakedefine01 _LIBCPP_ENABLE_HARDENED_MODE_DEFAULT

>From a3aaf5abaa2eedb081d194245c15d3e6f1bf5b05 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Fri, 6 Oct 2023 12:39:29 -0700
Subject: [PATCH 12/24] Configured PSTL OpenMP backend to use the serial CPU
 backend when it is not safe to offload

---
 libcxx/include/__algorithm/pstl_backend.h      | 18 +++---------------
 .../__algorithm/pstl_backends/cpu_backend.h    |  5 -----
 .../pstl_backends/cpu_backends/backend.h       |  4 +---
 .../include/__algorithm/pstl_backends/openmp.h |  6 ++----
 .../__algorithm/pstl_backends/openmp/any_of.h  |  3 ++-
 .../__algorithm/pstl_backends/openmp/fill.h    |  3 ++-
 .../__algorithm/pstl_backends/openmp/find_if.h |  3 ++-
 .../pstl_backends/openmp/for_each.h            |  9 +++++----
 .../__algorithm/pstl_backends/openmp/merge.h   |  4 +++-
 .../pstl_backends/openmp/stable_sort.h         |  3 ++-
 .../pstl_backends/openmp/transform.h           | 12 +++++-------
 .../pstl_backends/openmp/transform_reduce.h    | 12 +++++-------
 12 files changed, 32 insertions(+), 50 deletions(-)

diff --git a/libcxx/include/__algorithm/pstl_backend.h b/libcxx/include/__algorithm/pstl_backend.h
index 80c9db58e983458..dbd6f4dae8cbefe 100644
--- a/libcxx/include/__algorithm/pstl_backend.h
+++ b/libcxx/include/__algorithm/pstl_backend.h
@@ -179,9 +179,6 @@ implemented, all the algorithms will eventually forward to the basis algorithms
 template <class _ExecutionPolicy>
 struct __select_backend;
 
-#  if defined(_LIBCPP_PSTL_CPU_BACKEND_SERIAL) || defined(_LIBCPP_PSTL_CPU_BACKEND_THREAD) ||                          \
-      defined(_LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH)
-
 template <>
 struct __select_backend<std::execution::sequenced_policy> {
   using type = __cpu_backend_tag;
@@ -194,6 +191,9 @@ struct __select_backend<std::execution::unsequenced_policy> {
 };
 #  endif
 
+#  if defined(_LIBCPP_PSTL_CPU_BACKEND_SERIAL) || defined(_LIBCPP_PSTL_CPU_BACKEND_THREAD) ||                          \
+      defined(_LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH)
+
 template <>
 struct __select_backend<std::execution::parallel_policy> {
   using type = __cpu_backend_tag;
@@ -206,18 +206,6 @@ struct __select_backend<std::execution::parallel_unsequenced_policy> {
 
 #  elif defined(_LIBCPP_PSTL_BACKEND_OPENMP)
 
-template <>
-struct __select_backend<std::execution::sequenced_policy> {
-  using type = __omp_backend_tag;
-};
-
-#    if _LIBCPP_STD_VER >= 20
-template <>
-struct __select_backend<std::execution::unsequenced_policy> {
-  using type = __omp_backend_tag;
-};
-#    endif
-
 template <>
 struct __select_backend<std::execution::parallel_policy> {
   using type = __omp_backend_tag;
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backend.h b/libcxx/include/__algorithm/pstl_backends/cpu_backend.h
index 301ea391b33b73a..50876ed2532eb41 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backend.h
@@ -46,9 +46,6 @@
   TODO: Document the parallel backend
 */
 
-#if defined(_LIBCPP_PSTL_CPU_BACKEND_SERIAL) || defined(_LIBCPP_PSTL_CPU_BACKEND_THREAD) ||                            \
-    defined(_LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH)
-
 #  include <__algorithm/pstl_backends/cpu_backends/any_of.h>
 #  include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #  include <__algorithm/pstl_backends/cpu_backends/fill.h>
@@ -59,6 +56,4 @@
 #  include <__algorithm/pstl_backends/cpu_backends/transform.h>
 #  include <__algorithm/pstl_backends/cpu_backends/transform_reduce.h>
 
-#endif
-
 #endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKEND_H
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/backend.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/backend.h
index b8e9b1e28201d16..51aa878d734514d 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/backend.h
@@ -12,14 +12,12 @@
 #include <__config>
 #include <cstddef>
 
-#if defined(_LIBCPP_PSTL_CPU_BACKEND_SERIAL)
+#if defined(_LIBCPP_PSTL_CPU_BACKEND_SERIAL) || defined(_LIBCPP_PSTL_BACKEND_OPENMP)
 #  include <__algorithm/pstl_backends/cpu_backends/serial.h>
 #elif defined(_LIBCPP_PSTL_CPU_BACKEND_THREAD)
 #  include <__algorithm/pstl_backends/cpu_backends/thread.h>
 #elif defined(_LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH)
 #  include <__algorithm/pstl_backends/cpu_backends/libdispatch.h>
-#elif defined(_LIBCPP_PSTL_BACKEND_OPENMP)
-// OpenMP covers both CPU and GPU backends
 #else
 #  error "Invalid CPU backend choice"
 #endif
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp.h b/libcxx/include/__algorithm/pstl_backends/openmp.h
index 7787c82ff98825a..4c121e187ae246c 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp.h
@@ -9,11 +9,10 @@
 #ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
 #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
 
-#include <__config>
+#  include <__config>
 
-#include <__algorithm/pstl_backends/openmp/backend.h>
+#  include <__algorithm/pstl_backends/openmp/backend.h>
 
-#if defined(_LIBCPP_PSTL_BACKEND_OPENMP)
 #  include <__algorithm/pstl_backends/openmp/any_of.h>
 #  include <__algorithm/pstl_backends/openmp/fill.h>
 #  include <__algorithm/pstl_backends/openmp/find_if.h>
@@ -22,6 +21,5 @@
 #  include <__algorithm/pstl_backends/openmp/stable_sort.h>
 #  include <__algorithm/pstl_backends/openmp/transform.h>
 #  include <__algorithm/pstl_backends/openmp/transform_reduce.h>
-#endif
 
 #endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/any_of.h b/libcxx/include/__algorithm/pstl_backends/openmp/any_of.h
index 2b9a88fc58edcdd..a36f545d5bd719b 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/any_of.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/any_of.h
@@ -11,6 +11,7 @@
 
 #include <__algorithm/any_of.h>
 #include <__algorithm/find_if.h>
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__atomic/atomic.h>
 #include <__atomic/memory_order.h>
@@ -30,7 +31,7 @@ template <class _ExecutionPolicy, class _ForwardIterator, class _Predicate>
 _LIBCPP_HIDE_FROM_ABI bool
 __pstl_any_of(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) {
   // TODO: Implement GPU backend
-  return std::any_of(__first, __last, __pred);
+  return std::__pstl_any_of<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __pred);
 }
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/fill.h b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
index 56341b04f0e50df..6098769a4fedb2d 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
@@ -10,6 +10,7 @@
 #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_FILL_H
 
 #include <__algorithm/fill.h>
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
 #include <__iterator/concepts.h>
@@ -39,7 +40,7 @@ __pstl_fill(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __last
   }
   // Otherwise, we execute fill on the CPU instead
   else {
-    std::fill(__first, __last, __value);
+    std::__pstl_fill<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __value);
   }
 }
 
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
index 1e78cb67cd75978..6cd3b808e0700cc 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
@@ -10,6 +10,7 @@
 #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_FIND_IF_H
 
 #include <__algorithm/find_if.h>
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__atomic/atomic.h>
 #include <__config>
@@ -33,7 +34,7 @@ template <class _ExecutionPolicy, class _ForwardIterator, class _Predicate>
 _LIBCPP_HIDE_FROM_ABI _ForwardIterator
 __pstl_find_if(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) {
   // TODO: Implement the GPU backend
-  return std::find_if(__first, __last, __pred);
+  return std::__pstl_find_if<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __pred);
 }
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
index 401cdfade50a103..c1a159829989bb1 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
@@ -10,6 +10,7 @@
 #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_FOR_EACH_H
 
 #include <__algorithm/for_each.h>
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
 #include <__iterator/concepts.h>
@@ -35,10 +36,10 @@ __pstl_for_each(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __
                 __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
     std::__par_backend::__parallel_for_simd_1(__first, __last - __first, __func);
-  } else {
-    // If it is not safe to offload to the GPU, we call the serial
-    // implementation
-    std::for_each(__first, __last, __func);
+  }
+  // Else we fall back to the serial backend
+  else {
+    std::__pstl_for_each<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __func);
   }
 }
 
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/merge.h b/libcxx/include/__algorithm/pstl_backends/openmp/merge.h
index 23036e1cc853980..b6587b161d15488 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/merge.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/merge.h
@@ -10,6 +10,7 @@
 #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_MERGE_H
 
 #include <__algorithm/merge.h>
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
 #include <__iterator/concepts.h>
@@ -39,7 +40,8 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_merge(
     _ForwardOutIterator __result,
     _Comp __comp) {
   // TODO: Implement GPU backend
-  return std::merge(__first1, __last1, __first2, __last2, __result, __comp);
+  return std::__pstl_merge<_ExecutionPolicy>(
+      __cpu_backend_tag{}, __first1, __last1, __first2, __last2, __result, __comp);
 }
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/stable_sort.h b/libcxx/include/__algorithm/pstl_backends/openmp/stable_sort.h
index 7d28b8de77983a7..ac9323b04f63f90 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/stable_sort.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/stable_sort.h
@@ -9,6 +9,7 @@
 #ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_STABLE_SORT_H
 #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_STABLE_SORT_H
 
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__algorithm/stable_sort.h>
 #include <__config>
@@ -27,7 +28,7 @@ template <class _ExecutionPolicy, class _RandomAccessIterator, class _Comp>
 _LIBCPP_HIDE_FROM_ABI void
 __pstl_stable_sort(__omp_backend_tag, _RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp) {
   // TODO: Implement GPU backend.
-  std::stable_sort(__first, __last, __comp);
+  __pstl_stable_sort<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __comp);
 }
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
index 6dc7ab22c98cc35..f855c5146e0f39b 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
@@ -9,6 +9,7 @@
 #ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_TRANSFORM_H
 #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_TRANSFORM_H
 
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__algorithm/transform.h>
 #include <__config>
@@ -38,15 +39,12 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
                 __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value &&
-#  if _LIBCPP_STD_VER <= 17
-                __libcpp_is_contiguous_iterator<_ForwardIterator>::value &&
-#  endif
                 __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
     std::__par_backend::__parallel_for_simd_2(__first, __last - __first, __result, __op);
     return __result + (__last - __first);
   }
-  // If it is not safe to offload to the GPU, we rely on the serial backend.
-  return std::transform(__first, __last, __result, __op);
+  // If it is not safe to offload to the GPU, we rely on the CPU backend.
+  return std::__pstl_transform<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __result, __op);
 }
 
 template <class _ExecutionPolicy,
@@ -71,8 +69,8 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
     std::__par_backend::__parallel_for_simd_3(__first1, __last1 - __first1, __first2, __result, __op);
     return __result + (__last1 - __first1);
   }
-  // If it is not safe to offload to the GPU, we call the serial implementation
-  return std::transform(__first1, __last1, __first2, __result, __op);
+  // If it is not safe to offload to the GPU, we rely on the CPU backend.
+  return std::__pstl_transform<_ExecutionPolicy>(__cpu_backend_tag{}, __first1, __last1, __first2, __result, __op);
 }
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
index c63e456b314c1c8..b34d2be79921a71 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
@@ -9,6 +9,7 @@
 #ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_TRANSFORM_REDUCE_H
 #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_TRANSFORM_REDUCE_H
 
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
 #include <__functional/operations.h>
@@ -72,16 +73,13 @@ _LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value && is_arithmetic_v<_Tp> &&
-#  if _LIBCPP_STD_VER <= 17
-                __libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
-                __libcpp_is_contiguous_iterator<_ForwardIterator2>::value &&
-#  endif
                 (__is_trivial_plus_operation<_BinaryOperation1, _Tp, _Tp>::value ||
                  __is_supported_reduction<_BinaryOperation1, _Tp, _Tp>::value)) {
     return std::__par_backend::__parallel_for_simd_reduction_2(
         __first1, __first2, __last1 - __first1, __init, __reduce, __transform);
   }
-  return std::transform_reduce(__first1, __last1, __first2, std::move(__init), __reduce, __transform);
+  return std::__pstl_transform_reduce<_ExecutionPolicy>(
+      __cpu_backend_tag{}, __first1, __last1, __first2, std::move(__init), __reduce, __transform);
 }
 
 //===----------------------------------------------------------------------===//
@@ -104,8 +102,8 @@ _LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
     return std::__par_backend::__parallel_for_simd_reduction_1(
         __first, __last - __first, __init, __reduce, __transform);
   }
-  // If it is not safe to offload to the GPU, we call the serial implementation
-  return std::transform_reduce(__first, __last, std::move(__init), __reduce, __transform);
+  return std::__pstl_transform_reduce<_ExecutionPolicy>(
+      __cpu_backend_tag{}, __first, __last, std::move(__init), __reduce, __transform);
 }
 
 _LIBCPP_END_NAMESPACE_STD

>From e93331fa79dda22986ab4f4bc2a356d934fa428a Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Mon, 9 Oct 2023 11:48:58 -0700
Subject: [PATCH 13/24] Clang-formatted cpu_backend.h and openmp.h

---
 .../__algorithm/pstl_backends/cpu_backend.h   | 18 ++++++++---------
 .../__algorithm/pstl_backends/openmp.h        | 20 +++++++++----------
 2 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backend.h b/libcxx/include/__algorithm/pstl_backends/cpu_backend.h
index 50876ed2532eb41..e54f331b9430b68 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backend.h
@@ -46,14 +46,14 @@
   TODO: Document the parallel backend
 */
 
-#  include <__algorithm/pstl_backends/cpu_backends/any_of.h>
-#  include <__algorithm/pstl_backends/cpu_backends/backend.h>
-#  include <__algorithm/pstl_backends/cpu_backends/fill.h>
-#  include <__algorithm/pstl_backends/cpu_backends/find_if.h>
-#  include <__algorithm/pstl_backends/cpu_backends/for_each.h>
-#  include <__algorithm/pstl_backends/cpu_backends/merge.h>
-#  include <__algorithm/pstl_backends/cpu_backends/stable_sort.h>
-#  include <__algorithm/pstl_backends/cpu_backends/transform.h>
-#  include <__algorithm/pstl_backends/cpu_backends/transform_reduce.h>
+#include <__algorithm/pstl_backends/cpu_backends/any_of.h>
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
+#include <__algorithm/pstl_backends/cpu_backends/fill.h>
+#include <__algorithm/pstl_backends/cpu_backends/find_if.h>
+#include <__algorithm/pstl_backends/cpu_backends/for_each.h>
+#include <__algorithm/pstl_backends/cpu_backends/merge.h>
+#include <__algorithm/pstl_backends/cpu_backends/stable_sort.h>
+#include <__algorithm/pstl_backends/cpu_backends/transform.h>
+#include <__algorithm/pstl_backends/cpu_backends/transform_reduce.h>
 
 #endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKEND_H
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp.h b/libcxx/include/__algorithm/pstl_backends/openmp.h
index 4c121e187ae246c..6afc43f1faf3567 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp.h
@@ -9,17 +9,17 @@
 #ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
 #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
 
-#  include <__config>
+#include <__config>
 
-#  include <__algorithm/pstl_backends/openmp/backend.h>
+#include <__algorithm/pstl_backends/openmp/backend.h>
 
-#  include <__algorithm/pstl_backends/openmp/any_of.h>
-#  include <__algorithm/pstl_backends/openmp/fill.h>
-#  include <__algorithm/pstl_backends/openmp/find_if.h>
-#  include <__algorithm/pstl_backends/openmp/for_each.h>
-#  include <__algorithm/pstl_backends/openmp/merge.h>
-#  include <__algorithm/pstl_backends/openmp/stable_sort.h>
-#  include <__algorithm/pstl_backends/openmp/transform.h>
-#  include <__algorithm/pstl_backends/openmp/transform_reduce.h>
+#include <__algorithm/pstl_backends/openmp/any_of.h>
+#include <__algorithm/pstl_backends/openmp/fill.h>
+#include <__algorithm/pstl_backends/openmp/find_if.h>
+#include <__algorithm/pstl_backends/openmp/for_each.h>
+#include <__algorithm/pstl_backends/openmp/merge.h>
+#include <__algorithm/pstl_backends/openmp/stable_sort.h>
+#include <__algorithm/pstl_backends/openmp/transform.h>
+#include <__algorithm/pstl_backends/openmp/transform_reduce.h>
 
 #endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H

>From 9ea00c5718d57aa30f634743cf703342e4a105bf Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Mon, 9 Oct 2023 17:08:00 -0700
Subject: [PATCH 14/24] Removing unnecessary includes and redundant iterator
 checks

---
 libcxx/include/__algorithm/pstl_backend.h       |  2 +-
 .../include/__algorithm/pstl_backends/openmp.h  |  6 +++---
 .../__algorithm/pstl_backends/openmp/any_of.h   |  8 --------
 .../__algorithm/pstl_backends/openmp/fill.h     |  9 ++-------
 .../__algorithm/pstl_backends/openmp/find_if.h  |  8 --------
 .../__algorithm/pstl_backends/openmp/for_each.h |  4 ----
 .../__algorithm/pstl_backends/openmp/merge.h    |  4 ----
 .../pstl_backends/openmp/stable_sort.h          |  2 --
 .../pstl_backends/openmp/transform.h            |  6 ------
 .../pstl_backends/openmp/transform_reduce.h     | 17 +++++------------
 10 files changed, 11 insertions(+), 55 deletions(-)

diff --git a/libcxx/include/__algorithm/pstl_backend.h b/libcxx/include/__algorithm/pstl_backend.h
index dbd6f4dae8cbefe..14067351c6acbd4 100644
--- a/libcxx/include/__algorithm/pstl_backend.h
+++ b/libcxx/include/__algorithm/pstl_backend.h
@@ -208,7 +208,7 @@ struct __select_backend<std::execution::parallel_unsequenced_policy> {
 
 template <>
 struct __select_backend<std::execution::parallel_policy> {
-  using type = __omp_backend_tag;
+  using type = __cpu_backend_tag;
 };
 
 template <>
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp.h b/libcxx/include/__algorithm/pstl_backends/openmp.h
index 6afc43f1faf3567..9e5490db655f4bd 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp.h
@@ -6,8 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
-#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_H
 
 #include <__config>
 
@@ -22,4 +22,4 @@
 #include <__algorithm/pstl_backends/openmp/transform.h>
 #include <__algorithm/pstl_backends/openmp/transform_reduce.h>
 
-#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_H
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/any_of.h b/libcxx/include/__algorithm/pstl_backends/openmp/any_of.h
index a36f545d5bd719b..beae485f41f248d 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/any_of.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/any_of.h
@@ -10,18 +10,10 @@
 #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_ANY_OF_H
 
 #include <__algorithm/any_of.h>
-#include <__algorithm/find_if.h>
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
-#include <__atomic/atomic.h>
-#include <__atomic/memory_order.h>
 #include <__config>
-#include <__functional/operations.h>
-#include <__iterator/concepts.h>
 #include <__type_traits/is_execution_policy.h>
-#include <__utility/pair.h>
-#include <__utility/terminate_on_exception.h>
-#include <cstdint>
 
 #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
 
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/fill.h b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
index 6098769a4fedb2d..19a3ba3f7be07bd 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
@@ -13,11 +13,7 @@
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
-#include <__iterator/concepts.h>
-#include <__iterator/iterator_traits.h>
 #include <__type_traits/is_execution_policy.h>
-#include <__utility/terminate_on_exception.h>
-#include <stdio.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -31,10 +27,9 @@ template <class _ExecutionPolicy, class _ForwardIterator, class _Tp>
 _LIBCPP_HIDE_FROM_ABI void
 __pstl_fill(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) {
   // It is only safe to execute fill on the GPU, it the execution policy is
-  // parallel unsequenced, as it is the only execution policy prohibiting throwing
-  // exceptions and allowing SIMD instructions
+  // parallel unsequenced, as it is the only execution policy allowing
+  // SIMD instructions
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
-                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
     std::__par_backend::__parallel_for_simd_val_1(__first, __last - __first, __value);
   }
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
index 6cd3b808e0700cc..4526d3976567a07 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
@@ -12,15 +12,7 @@
 #include <__algorithm/find_if.h>
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
-#include <__atomic/atomic.h>
 #include <__config>
-#include <__functional/operations.h>
-#include <__iterator/concepts.h>
-#include <__iterator/iterator_traits.h>
-#include <__type_traits/is_execution_policy.h>
-#include <__utility/pair.h>
-#include <__utility/terminate_on_exception.h>
-#include <cstddef>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
index c1a159829989bb1..30c2e5cca2f2649 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
@@ -13,10 +13,7 @@
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
-#include <__iterator/concepts.h>
 #include <__type_traits/is_execution_policy.h>
-#include <__utility/terminate_on_exception.h>
-#include <stdio.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -33,7 +30,6 @@ __pstl_for_each(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __
   // parallel unsequenced, as it is the only execution policy prohibiting throwing
   // exceptions and allowing SIMD instructions
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
-                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
     std::__par_backend::__parallel_for_simd_1(__first, __last - __first, __func);
   }
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/merge.h b/libcxx/include/__algorithm/pstl_backends/openmp/merge.h
index b6587b161d15488..666c3a0e0001933 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/merge.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/merge.h
@@ -13,10 +13,6 @@
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
-#include <__iterator/concepts.h>
-#include <__type_traits/is_execution_policy.h>
-#include <__utility/move.h>
-#include <__utility/terminate_on_exception.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/stable_sort.h b/libcxx/include/__algorithm/pstl_backends/openmp/stable_sort.h
index ac9323b04f63f90..46e48dd279e89bd 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/stable_sort.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/stable_sort.h
@@ -13,8 +13,6 @@
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__algorithm/stable_sort.h>
 #include <__config>
-#include <__type_traits/is_execution_policy.h>
-#include <__utility/terminate_on_exception.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
index f855c5146e0f39b..e507a16eac965b0 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
@@ -14,11 +14,7 @@
 #include <__algorithm/transform.h>
 #include <__config>
 #include <__iterator/concepts.h>
-#include <__iterator/iterator_traits.h>
-#include <__type_traits/enable_if.h>
 #include <__type_traits/is_execution_policy.h>
-#include <__type_traits/remove_cvref.h>
-#include <__utility/terminate_on_exception.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -38,7 +34,6 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
   // The interface for the function switched between C++17 and C++20
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
-                __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
     std::__par_backend::__parallel_for_simd_2(__first, __last - __first, __result, __op);
     return __result + (__last - __first);
@@ -64,7 +59,6 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value &&
-                __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
     std::__par_backend::__parallel_for_simd_3(__first1, __last1 - __first1, __first2, __result, __op);
     return __result + (__last1 - __first1);
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
index b34d2be79921a71..eb0a7ba52d8fdcc 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
@@ -12,17 +12,12 @@
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
-#include <__functional/operations.h>
 #include <__iterator/concepts.h>
-#include <__iterator/iterator_traits.h>
 #include <__numeric/transform_reduce.h>
 #include <__type_traits/integral_constant.h>
 #include <__type_traits/is_arithmetic.h>
 #include <__type_traits/is_execution_policy.h>
 #include <__type_traits/operation_traits.h>
-#include <__utility/move.h>
-#include <__utility/terminate_on_exception.h>
-#include <new>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -33,13 +28,13 @@
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <class _T1, class _T2, class _T3>
-struct _LIBCPP_HIDE_FROM_ABI __is_supported_reduction : std::false_type {};
+struct __is_supported_reduction : std::false_type {};
 
 #  define __PSTL_IS_SUPPORTED_REDUCTION(funname)                                                                       \
     template <class _Tp>                                                                                               \
-    struct _LIBCPP_HIDE_FROM_ABI __is_supported_reduction<std::funname<_Tp>, _Tp, _Tp> : std::true_type {};            \
+    struct __is_supported_reduction<std::funname<_Tp>, _Tp, _Tp> : std::true_type {};                                  \
     template <class _Tp, class _Up>                                                                                    \
-    struct _LIBCPP_HIDE_FROM_ABI __is_supported_reduction<std::funname<>, _Tp, _Up> : std::true_type {};
+    struct __is_supported_reduction<std::funname<>, _Tp, _Up> : std::true_type {};
 
 // __is_trivial_plus_operation already exists
 __PSTL_IS_SUPPORTED_REDUCTION(plus)
@@ -73,8 +68,7 @@ _LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value && is_arithmetic_v<_Tp> &&
-                (__is_trivial_plus_operation<_BinaryOperation1, _Tp, _Tp>::value ||
-                 __is_supported_reduction<_BinaryOperation1, _Tp, _Tp>::value)) {
+                __is_supported_reduction<_BinaryOperation1, _Tp, _Tp>::value) {
     return std::__par_backend::__parallel_for_simd_reduction_2(
         __first1, __first2, __last1 - __first1, __init, __reduce, __transform);
   }
@@ -97,8 +91,7 @@ _LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
   // The interface for the function switched between C++17 and C++20
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator>::value && is_arithmetic_v<_Tp> &&
-                (__is_trivial_plus_operation<_BinaryOperation, _Tp, _Tp>::value ||
-                 __is_supported_reduction<_BinaryOperation, _Tp, _Tp>::value)) {
+                __is_supported_reduction<_BinaryOperation, _Tp, _Tp>::value) {
     return std::__par_backend::__parallel_for_simd_reduction_1(
         __first, __last - __first, __init, __reduce, __transform);
   }

>From 1adc6aaa4b9bd8a44b2e68bbd6878131dde2b5d6 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Mon, 16 Oct 2023 01:06:48 -0700
Subject: [PATCH 15/24] Adding optional to OpenMP backend and other
 refactorings requested during code review

---
 .../__algorithm/pstl_backends/openmp/any_of.h |   3 +-
 .../__algorithm/pstl_backends/openmp/fill.h   |  32 ++-
 .../pstl_backends/openmp/find_if.h            |  39 +++-
 .../pstl_backends/openmp/for_each.h           |  33 +++-
 .../__algorithm/pstl_backends/openmp/merge.h  |   3 +-
 .../pstl_backends/openmp/omp_offload.h        | 185 +++---------------
 .../pstl_backends/openmp/stable_sort.h        |   6 +-
 .../pstl_backends/openmp/transform.h          |  93 ++++++++-
 .../pstl_backends/openmp/transform_reduce.h   |  16 +-
 9 files changed, 225 insertions(+), 185 deletions(-)

diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/any_of.h b/libcxx/include/__algorithm/pstl_backends/openmp/any_of.h
index beae485f41f248d..65f2294ff2ee5fe 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/any_of.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/any_of.h
@@ -14,13 +14,14 @@
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
 #include <__type_traits/is_execution_policy.h>
+#include <optional>
 
 #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <class _ExecutionPolicy, class _ForwardIterator, class _Predicate>
-_LIBCPP_HIDE_FROM_ABI bool
+_LIBCPP_HIDE_FROM_ABI optional<bool>
 __pstl_any_of(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) {
   // TODO: Implement GPU backend
   return std::__pstl_any_of<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __pred);
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/fill.h b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
index 19a3ba3f7be07bd..250e514b4d526fd 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
@@ -14,6 +14,8 @@
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
 #include <__type_traits/is_execution_policy.h>
+#include <__utility/empty.h>
+#include <optional>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -23,19 +25,43 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
+namespace __par_backend {
+inline namespace __omp_gpu_backend {
+
+template <class _Tp, class _DifferenceType, class _Up>
+_LIBCPP_HIDE_FROM_ABI optional<__empty> __omp_parallel_for_simd_val(
+    _Tp* __first, _DifferenceType __n, const _Up& __value) noexcept {
+  __omp_gpu_backend::__omp_map_alloc(__first, __n);
+#  pragma omp target teams distribute parallel for simd firstprivate(__value)
+  for (_DifferenceType __i = 0; __i < __n; ++__i)
+    *(__first+__i) = __value;
+  __omp_gpu_backend::__omp_map_from(__first, __n);
+  return __empty{};
+}
+
+template <class _ForwardIterator, class _DifferenceType, class _Tp>
+_LIBCPP_HIDE_FROM_ABI optional<__empty>
+__parallel_for_simd_val(_ForwardIterator __first, _DifferenceType __n, const _Tp& __value) noexcept {
+  return __omp_gpu_backend::__omp_parallel_for_simd_val(__omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __value);
+}
+
+}
+}
+
 template <class _ExecutionPolicy, class _ForwardIterator, class _Tp>
-_LIBCPP_HIDE_FROM_ABI void
+_LIBCPP_HIDE_FROM_ABI optional<__empty>
 __pstl_fill(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) {
   // It is only safe to execute fill on the GPU, it the execution policy is
   // parallel unsequenced, as it is the only execution policy allowing
   // SIMD instructions
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
+                __is_parallel_execution_policy_v<_ExecutionPolicy> &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
-    std::__par_backend::__parallel_for_simd_val_1(__first, __last - __first, __value);
+    return std::__par_backend::__parallel_for_simd_val(__first, __last - __first, __value);
   }
   // Otherwise, we execute fill on the CPU instead
   else {
-    std::__pstl_fill<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __value);
+    return std::__pstl_fill<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __value);
   }
 }
 
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
index 4526d3976567a07..94f7c42953c2a4a 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
@@ -13,6 +13,7 @@
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
+#include <optional>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -22,11 +23,43 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
+namespace __par_backend {
+inline namespace __omp_gpu_backend {
+
+template <class _Tp, class _DifferenceType, class _Predicate>
+_LIBCPP_HIDE_FROM_ABI _DifferenceType  __omp_parallel_for_min_idx(_Tp* __first, _DifferenceType __n, _Predicate __pred) noexcept {
+  __omp_gpu_backend::__omp_map_to(__first, __n);
+  _DifferenceType idx = __n;
+#  pragma omp target teams distribute parallel for simd reduction(min:idx)
+  for (_DifferenceType __i = 0; __i < __n; ++__i){
+    if (__pred(*(__first+__i))) {
+      idx = (__i < idx) ? __i : idx;
+    }
+  }
+  __omp_gpu_backend::__omp_map_free(__first, __n);
+  return idx;
+}
+
+// Extracting the underlying pointer
+
+template <class _ForwardIterator, class _DifferenceType, class _Predicate>
+_LIBCPP_HIDE_FROM_ABI optional<_ForwardIterator> __parallel_for_min_idx(_ForwardIterator __first, _DifferenceType __n, _Predicate __pred) noexcept {
+  return __first + __omp_gpu_backend::__omp_parallel_for_min_idx(__omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __pred);
+}
+
+}
+}
+
 template <class _ExecutionPolicy, class _ForwardIterator, class _Predicate>
-_LIBCPP_HIDE_FROM_ABI _ForwardIterator
+_LIBCPP_HIDE_FROM_ABI optional<_ForwardIterator>
 __pstl_find_if(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) {
-  // TODO: Implement the GPU backend
-  return std::__pstl_find_if<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __pred);
+  if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
+                __is_parallel_execution_policy_v<_ExecutionPolicy> &&
+                __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
+    return __par_backend::__parallel_for_min_idx(__first, __last - __first, __pred);
+  } else {
+    return std::__pstl_find_if<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __pred);
+  }
 }
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
index 30c2e5cca2f2649..444352b4a5e51a8 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
@@ -14,6 +14,8 @@
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
 #include <__type_traits/is_execution_policy.h>
+#include <optional>
+#include <__utility/empty.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -23,19 +25,44 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
+namespace __par_backend {
+inline namespace __omp_gpu_backend {
+
+template <class _Tp, class _DifferenceType, class _Function>
+_LIBCPP_HIDE_FROM_ABI optional<__empty> __omp_parallel_for_simd(
+    _Tp* __first, _DifferenceType __n, _Function __f) noexcept {
+  __omp_gpu_backend::__omp_map_to(__first, __n);
+#  pragma omp target teams distribute parallel for simd
+  for (_DifferenceType __i = 0; __i < __n; ++__i)
+    __f(*(__first+__i));
+  __omp_gpu_backend::__omp_map_from(__first, __n);
+  return __empty{};
+}
+
+// Extracting the underlying pointer
+
+template <class _ForwardIterator, class _DifferenceType, class _Function>
+_LIBCPP_HIDE_FROM_ABI optional<__empty> __parallel_for_simd_1(_ForwardIterator __first, _DifferenceType __n, _Function __f) noexcept {
+  return __omp_gpu_backend::__omp_parallel_for_simd(__omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __f);
+}
+
+}
+}
+
 template <class _ExecutionPolicy, class _ForwardIterator, class _Functor>
-_LIBCPP_HIDE_FROM_ABI void
+_LIBCPP_HIDE_FROM_ABI optional<__empty>
 __pstl_for_each(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Functor __func) {
   // It is only safe to execute for_each on the GPU, it the execution policy is
   // parallel unsequenced, as it is the only execution policy prohibiting throwing
   // exceptions and allowing SIMD instructions
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
+                __libcpp_is_contiguous_iterator<_ForwardIterator>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
-    std::__par_backend::__parallel_for_simd_1(__first, __last - __first, __func);
+    return std::__par_backend::__parallel_for_simd_1(__first, __last - __first, __func);
   }
   // Else we fall back to the serial backend
   else {
-    std::__pstl_for_each<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __func);
+    return std::__pstl_for_each<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __func);
   }
 }
 
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/merge.h b/libcxx/include/__algorithm/pstl_backends/openmp/merge.h
index 666c3a0e0001933..8fff9125add191d 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/merge.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/merge.h
@@ -13,6 +13,7 @@
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
+#include <optional>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -27,7 +28,7 @@ template <class _ExecutionPolicy,
           class _ForwardIterator2,
           class _ForwardOutIterator,
           class _Comp>
-_LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_merge(
+_LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator> __pstl_merge(
     __omp_backend_tag,
     _ForwardIterator1 __first1,
     _ForwardIterator1 __last1,
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h b/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
index ee0bfe13bac9010..77afb35ee71b4bb 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
@@ -10,6 +10,7 @@
 #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_OMP_OFFLOAD_H
 
 #include <__assert>
+#include <__algorithm/unwrap_iter.h>
 #include <__config>
 #include <__functional/operations.h>
 #include <__iterator/iterator_traits.h>
@@ -19,7 +20,9 @@
 #include <__type_traits/is_pointer.h>
 #include <__type_traits/is_same.h>
 #include <__utility/move.h>
+#include <__utility/empty.h>
 #include <cstddef>
+#include <optional>
 
 // is_same
 
@@ -39,13 +42,15 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 namespace __par_backend {
 inline namespace __omp_gpu_backend {
 
+//===----------------------------------------------------------------------===//
 // Functions for eaxtracting the pase pointers
+//===----------------------------------------------------------------------===//
 
 // In the general case we do not need to extract it. This is for instance the
 // case for pointers.
 template <typename _Tp>
 _LIBCPP_HIDE_FROM_ABI inline auto __omp_extract_base_ptr(_Tp p) noexcept {
-  return p;
+  return std::__unwrap_iter(p);
 }
 
 // For vectors and arrays, etc, we need to extract the underlying base pointer.
@@ -64,159 +69,29 @@ _LIBCPP_HIDE_FROM_ABI inline _Tp __omp_extract_base_ptr(std::__wrap_iter<_Tp> w)
 template <class _Iterator, class _DifferenceType>
 _LIBCPP_HIDE_FROM_ABI void
 __omp_map_to([[maybe_unused]] const _Iterator p, [[maybe_unused]] const _DifferenceType len) noexcept {
-  if constexpr (__libcpp_is_contiguous_iterator<_Iterator>::value) {
-#  pragma omp target enter data map(to : p[0 : len])
-  } else {
-#  pragma omp target enter data map(to : p)
-  }
+  static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
+#pragma omp target enter data map(to : p[0 : len])
 }
 
 template <class _Iterator, class _DifferenceType>
 _LIBCPP_HIDE_FROM_ABI void
 __omp_map_from([[maybe_unused]] const _Iterator p, [[maybe_unused]] const _DifferenceType len) noexcept {
-  if constexpr (__libcpp_is_contiguous_iterator<_Iterator>::value) {
-#  pragma omp target exit data map(from : p[0 : len])
-  } else {
-#  pragma omp target exit data map(release : p)
-  }
+  static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
+#pragma omp target exit data map(from : p[0 : len])
 }
 
 template <class _Iterator, class _DifferenceType>
 _LIBCPP_HIDE_FROM_ABI void
 __omp_map_alloc([[maybe_unused]] const _Iterator p, [[maybe_unused]] const _DifferenceType len) noexcept {
-  if constexpr (__libcpp_is_contiguous_iterator<_Iterator>::value) {
-#  pragma omp target enter data map(alloc : p[0 : len])
-  } else {
-#  pragma omp target enter data map(to : p)
-  }
+  static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
+#pragma omp target enter data map(alloc : p[0 : len])
 }
 
 template <class _Iterator, class _DifferenceType>
 _LIBCPP_HIDE_FROM_ABI void
 __omp_map_free([[maybe_unused]] const _Iterator p, [[maybe_unused]] const _DifferenceType len) noexcept {
-  if constexpr (__libcpp_is_contiguous_iterator<_Iterator>::value) {
-#  pragma omp target exit data map(release : p[0 : len])
-  } else {
-#  pragma omp target exit data map(release : p)
-  }
-}
-
-//===----------------------------------------------------------------------===//
-// Templates for one iterator
-//===----------------------------------------------------------------------===//
-
-// Applying function or lambda in a loop
-
-template <class _Iterator, class _DifferenceType, class _Function>
-_LIBCPP_HIDE_FROM_ABI _Iterator __omp_parallel_for_simd_1(
-    _Iterator __first, _DifferenceType __n, _Function __f, [[maybe_unused]] const int __device = 0) noexcept {
-  __omp_map_to(__first, __n);
-#  pragma omp target teams distribute parallel for simd device(__device)
-  for (_DifferenceType __i = 0; __i < __n; ++__i)
-    __f(__first[__i]);
-  __omp_map_from(__first, __n);
-
-  return __first + __n;
-}
-
-// Extracting the underlying pointer
-
-template <class _Iterator, class _DifferenceType, class _Function>
-_LIBCPP_HIDE_FROM_ABI _Iterator __parallel_for_simd_1(_Iterator __first, _DifferenceType __n, _Function __f) noexcept {
-  __omp_parallel_for_simd_1(__omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __f);
-  return __first + __n;
-}
-
-// Assigning a value in a loop
-
-template <class _Index, class _DifferenceType, class _Tp>
-_LIBCPP_HIDE_FROM_ABI _Index __omp_parallel_for_simd_val_1(
-    _Index __first, _DifferenceType __n, const _Tp& __value, [[maybe_unused]] const int __device = 0) noexcept {
-  __omp_map_alloc(__first, __n);
-#  pragma omp target teams distribute parallel for simd firstprivate(__value) device(__device)
-  for (_DifferenceType __i = 0; __i < __n; ++__i)
-    __first[__i] = __value;
-  __omp_map_from(__first, __n);
-  return __first + __n;
-}
-
-template <class _Index, class _DifferenceType, class _Tp>
-_LIBCPP_HIDE_FROM_ABI _Index
-__parallel_for_simd_val_1(_Index __first, _DifferenceType __n, const _Tp& __value) noexcept {
-  __omp_parallel_for_simd_val_1(__omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __value);
-  return __first + __n;
-}
-
-//===----------------------------------------------------------------------===//
-// Templates for two iterators
-//===----------------------------------------------------------------------===//
-
-template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Function>
-_LIBCPP_HIDE_FROM_ABI _Iterator1 __omp_parallel_for_simd_2(
-    _Iterator1 __first1,
-    _DifferenceType __n,
-    _Iterator2 __first2,
-    _Function __f,
-    [[maybe_unused]] const int __device = 0) noexcept {
-  __omp_map_alloc(__first2, __n);
-  __omp_map_to(__first1, __n);
-#  pragma omp target teams distribute parallel for simd device(__device)
-  for (_DifferenceType __i = 0; __i < __n; ++__i)
-    *(__first2 + __i) = __f(*(__first1 + __i));
-  __omp_map_from(__first2, __n);
-  __omp_map_free(__first1, __n);
-  return __first1 + __n;
-}
-
-// Extracting the underlying pointer
-
-template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Function>
-_LIBCPP_HIDE_FROM_ABI _Iterator1
-__parallel_for_simd_2(_Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Function __f) noexcept {
-  __omp_parallel_for_simd_2(
-      __omp_gpu_backend::__omp_extract_base_ptr(__first1),
-      __n,
-      __omp_gpu_backend::__omp_extract_base_ptr(__first2),
-      __f);
-  return __first1 + __n;
-}
-
-//===----------------------------------------------------------------------===//
-// Templates for three iterator
-//===----------------------------------------------------------------------===//
-
-template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Iterator3, class _Function>
-_LIBCPP_HIDE_FROM_ABI _Iterator1 __omp_parallel_for_simd_3(
-    _Iterator1 __first1,
-    _DifferenceType __n,
-    _Iterator2 __first2,
-    _Iterator3 __first3,
-    _Function __f,
-    [[maybe_unused]] const int __device = 0) noexcept {
-  __omp_map_to(__first1, __n);
-  __omp_map_to(__first2, __n);
-  __omp_map_alloc(__first3, __n);
-#  pragma omp target teams distribute parallel for simd device(__device)
-  for (_DifferenceType __i = 0; __i < __n; ++__i)
-    *(__first3 + __i) = __f(*(__first1 + __i), *(__first2 + __i));
-  __omp_map_free(__first1, __n);
-  __omp_map_free(__first2, __n);
-  __omp_map_from(__first3, __n);
-  return __first1 + __n;
-}
-
-// Extracting the underlying pointer
-
-template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Iterator3, class _Function>
-_LIBCPP_HIDE_FROM_ABI _Iterator1 __parallel_for_simd_3(
-    _Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Iterator3 __first3, _Function __f) noexcept {
-  __omp_parallel_for_simd_3(
-      __omp_gpu_backend::__omp_extract_base_ptr(__first1),
-      __n,
-      __omp_gpu_backend::__omp_extract_base_ptr(__first2),
-      __omp_gpu_backend::__omp_extract_base_ptr(__first3),
-      __f);
-  return __first1 + __n;
+  static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
+#pragma omp target exit data map(release : p[0 : len])
 }
 
 //===----------------------------------------------------------------------===//
@@ -237,13 +112,12 @@ _LIBCPP_HIDE_FROM_ABI _Iterator1 __parallel_for_simd_3(
         _DifferenceType __n,                                                                                           \
         _Tp __init,                                                                                                    \
         std_op<_BinaryOperationType> __reduce,                                                                         \
-        _UnaryOperation __transform,                                                                                   \
-        [[maybe_unused]] const int __device = 0) noexcept {                                                            \
-      __omp_map_to(__first, __n);                                                                                      \
-_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init) device(__device))                  \
+        _UnaryOperation __transform) noexcept {                                                                         \
+      __omp_gpu_backend::__omp_map_to(__first, __n);                                                                                      \
+_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init))                                    \
       for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                  \
         __init = __reduce(__init, __transform(*(__first + __i)));                                                      \
-      __omp_map_free(__first, __n);                                                                                    \
+      __omp_gpu_backend::__omp_map_free(__first, __n);                                                                                    \
       return __init;                                                                                                   \
     }
 
@@ -260,15 +134,14 @@ _PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__in
         _DifferenceType __n,                                                                                           \
         _Tp __init,                                                                                                    \
         std_op<_BinaryOperationType> __reduce,                                                                         \
-        _UnaryOperation __transform,                                                                                   \
-        [[maybe_unused]] const int __device = 0) noexcept {                                                            \
-      __omp_map_to(__first1, __n);                                                                                     \
-      __omp_map_to(__first2, __n);                                                                                     \
-_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init) device(__device))                  \
+        _UnaryOperation __transform) noexcept {                                                                         \
+      __omp_gpu_backend::__omp_map_to(__first1, __n);                                                                                     \
+      __omp_gpu_backend::__omp_map_to(__first2, __n);                                                                                     \
+_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init))                                    \
       for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                  \
         __init = __reduce(__init, __transform(*(__first1 + __i), *(__first2 + __i)));                                  \
-      __omp_map_free(__first1, __n);                                                                                   \
-      __omp_map_free(__first2, __n);                                                                                   \
+      __omp_gpu_backend::__omp_map_free(__first1, __n);                                                                                   \
+      __omp_gpu_backend::__omp_map_free(__first2, __n);                                                                                   \
       return __init;                                                                                                   \
     } // namespace __omp_gpu_backend
 
@@ -308,9 +181,8 @@ _LIBCPP_HIDE_FROM_ABI _Tp __parallel_for_simd_reduction_1(
     _DifferenceType __n,
     _Tp __init,
     _BinaryOperation __reduce,
-    _UnaryOperation __transform,
-    [[maybe_unused]] const int __device = 0) noexcept {
-  return __omp_parallel_for_simd_reduction_1(
+    _UnaryOperation __transform) noexcept {
+  return __omp_gpu_backend::__omp_parallel_for_simd_reduction_1(
       __omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __init, __reduce, __transform);
 }
 
@@ -326,9 +198,8 @@ _LIBCPP_HIDE_FROM_ABI _Tp __parallel_for_simd_reduction_2(
     _DifferenceType __n,
     _Tp __init,
     _BinaryOperation __reduce,
-    _UnaryOperation __transform,
-    [[maybe_unused]] const int __device = 0) noexcept {
-  return __omp_parallel_for_simd_reduction_2(
+    _UnaryOperation __transform) noexcept {
+  return __omp_gpu_backend::__omp_parallel_for_simd_reduction_2(
       __omp_gpu_backend::__omp_extract_base_ptr(__first1),
       __omp_gpu_backend::__omp_extract_base_ptr(__first2),
       __n,
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/stable_sort.h b/libcxx/include/__algorithm/pstl_backends/openmp/stable_sort.h
index 46e48dd279e89bd..a4c6a2bff9f92fd 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/stable_sort.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/stable_sort.h
@@ -13,6 +13,8 @@
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__algorithm/stable_sort.h>
 #include <__config>
+#include <__utility/empty.h>
+#include <optional>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -23,10 +25,10 @@
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <class _ExecutionPolicy, class _RandomAccessIterator, class _Comp>
-_LIBCPP_HIDE_FROM_ABI void
+_LIBCPP_HIDE_FROM_ABI optional<__empty>
 __pstl_stable_sort(__omp_backend_tag, _RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp) {
   // TODO: Implement GPU backend.
-  __pstl_stable_sort<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __comp);
+  return __pstl_stable_sort<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __comp);
 }
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
index e507a16eac965b0..b29a479c17887c7 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
@@ -15,6 +15,7 @@
 #include <__config>
 #include <__iterator/concepts.h>
 #include <__type_traits/is_execution_policy.h>
+#include <optional>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -24,18 +25,92 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
+namespace __par_backend {
+inline namespace __omp_gpu_backend {
+
+//===----------------------------------------------------------------------===//
+// Templates for two iterators
+//===----------------------------------------------------------------------===//
+
+template <class _Tp, class _DifferenceType, class _Up, class _Function>
+_LIBCPP_HIDE_FROM_ABI optional<__empty> __omp_parallel_for_simd(
+    _Tp* __first1,
+    _DifferenceType __n,
+    _Up* __first2,
+    _Function __f) noexcept {
+  __omp_gpu_backend::__omp_map_alloc(__first2, __n);
+  __omp_gpu_backend::__omp_map_to(__first1, __n);
+#  pragma omp target teams distribute parallel for simd
+  for (_DifferenceType __i = 0; __i < __n; ++__i)
+    *(__first2 + __i) = __f(*(__first1 + __i));
+  __omp_gpu_backend::__omp_map_from(__first2, __n);
+  __omp_gpu_backend::__omp_map_free(__first1, __n);
+  return __empty{};
+}
+
+// Extracting the underlying pointer
+
+template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Function>
+_LIBCPP_HIDE_FROM_ABI optional<__empty>
+__parallel_for_simd(_Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Function __f) noexcept {
+  return __omp_gpu_backend::__omp_parallel_for_simd(
+      __omp_gpu_backend::__omp_extract_base_ptr(__first1),
+      __n,
+      __omp_gpu_backend::__omp_extract_base_ptr(__first2),
+      __f);
+}
+
+//===----------------------------------------------------------------------===//
+// Templates for three iterator
+//===----------------------------------------------------------------------===//
+
+template <class _Tp, class _DifferenceType, class _Up, class _Vp, class _Function>
+_LIBCPP_HIDE_FROM_ABI optional<__empty> __omp_parallel_for_simd(
+    _Tp* __first1,
+    _DifferenceType __n,
+    _Up* __first2,
+    _Vp* __first3,
+    _Function __f) noexcept {
+  __omp_gpu_backend::__omp_map_to(__first1, __n);
+  __omp_gpu_backend::__omp_map_to(__first2, __n);
+  __omp_gpu_backend::__omp_map_alloc(__first3, __n);
+#  pragma omp target teams distribute parallel for simd
+  for (_DifferenceType __i = 0; __i < __n; ++__i)
+    *(__first3 + __i) = __f(*(__first1 + __i), *(__first2 + __i));
+  __omp_gpu_backend::__omp_map_free(__first1, __n);
+  __omp_gpu_backend::__omp_map_free(__first2, __n);
+  __omp_gpu_backend::__omp_map_from(__first3, __n);
+  return __empty{};
+}
+
+// Extracting the underlying pointer
+
+template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Iterator3, class _Function>
+_LIBCPP_HIDE_FROM_ABI optional<__empty> __parallel_for_simd(
+    _Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Iterator3 __first3, _Function __f) noexcept {
+  return __omp_gpu_backend::__omp_parallel_for_simd(
+      __omp_gpu_backend::__omp_extract_base_ptr(__first1),
+      __n,
+      __omp_gpu_backend::__omp_extract_base_ptr(__first2),
+      __omp_gpu_backend::__omp_extract_base_ptr(__first3),
+      __f);
+}
+
+}
+}
+
 template <class _ExecutionPolicy, class _ForwardIterator, class _ForwardOutIterator, class _UnaryOperation>
-_LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
+_LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator> __pstl_transform(
     __omp_backend_tag,
     _ForwardIterator __first,
     _ForwardIterator __last,
     _ForwardOutIterator __result,
     _UnaryOperation __op) {
-  // The interface for the function switched between C++17 and C++20
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
-                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
+                __is_parallel_execution_policy_v<_ExecutionPolicy> &&
+                __libcpp_is_contiguous_iterator<_ForwardIterator>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
-    std::__par_backend::__parallel_for_simd_2(__first, __last - __first, __result, __op);
+    std::__par_backend::__parallel_for_simd(__first, __last - __first, __result, __op);
     return __result + (__last - __first);
   }
   // If it is not safe to offload to the GPU, we rely on the CPU backend.
@@ -48,19 +123,19 @@ template <class _ExecutionPolicy,
           class _ForwardOutIterator,
           class _BinaryOperation,
           enable_if_t<is_execution_policy_v<__remove_cvref_t<_ExecutionPolicy>>, int> = 0>
-_LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
+_LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator> __pstl_transform(
     __omp_backend_tag,
     _ForwardIterator1 __first1,
     _ForwardIterator1 __last1,
     _ForwardIterator2 __first2,
     _ForwardOutIterator __result,
     _BinaryOperation __op) {
-  // The interface for the function switched between C++17 and C++20
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
-                __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
-                __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value &&
+                __is_parallel_execution_policy_v<_ExecutionPolicy> &&
+                __libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
+                __libcpp_is_contiguous_iterator<_ForwardIterator2>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
-    std::__par_backend::__parallel_for_simd_3(__first1, __last1 - __first1, __first2, __result, __op);
+    std::__par_backend::__parallel_for_simd(__first1, __last1 - __first1, __first2, __result, __op);
     return __result + (__last1 - __first1);
   }
   // If it is not safe to offload to the GPU, we rely on the CPU backend.
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
index eb0a7ba52d8fdcc..ddc3ba03f8e9b08 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
@@ -18,6 +18,7 @@
 #include <__type_traits/is_arithmetic.h>
 #include <__type_traits/is_execution_policy.h>
 #include <__type_traits/operation_traits.h>
+#include <optional>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -56,7 +57,7 @@ template <class _ExecutionPolicy,
           class _Tp,
           class _BinaryOperation1,
           class _BinaryOperation2>
-_LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
+_LIBCPP_HIDE_FROM_ABI optional<_Tp> __pstl_transform_reduce(
     __omp_backend_tag,
     _ForwardIterator1 __first1,
     _ForwardIterator1 __last1,
@@ -66,8 +67,10 @@ _LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
     _BinaryOperation2 __transform) {
   // The interface for the function switched between C++17 and C++20
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
-                __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
-                __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value && is_arithmetic_v<_Tp> &&
+                __is_parallel_execution_policy_v<_ExecutionPolicy> &&
+                __libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
+                __libcpp_is_contiguous_iterator<_ForwardIterator2>::value && 
+                is_arithmetic_v<_Tp> &&
                 __is_supported_reduction<_BinaryOperation1, _Tp, _Tp>::value) {
     return std::__par_backend::__parallel_for_simd_reduction_2(
         __first1, __first2, __last1 - __first1, __init, __reduce, __transform);
@@ -81,16 +84,17 @@ _LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
 //===----------------------------------------------------------------------===//
 
 template <class _ExecutionPolicy, class _ForwardIterator, class _Tp, class _BinaryOperation, class _UnaryOperation>
-_LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
+_LIBCPP_HIDE_FROM_ABI optional<_Tp> __pstl_transform_reduce(
     __omp_backend_tag,
     _ForwardIterator __first,
     _ForwardIterator __last,
     _Tp __init,
     _BinaryOperation __reduce,
     _UnaryOperation __transform) {
-  // The interface for the function switched between C++17 and C++20
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
-                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value && is_arithmetic_v<_Tp> &&
+                __is_parallel_execution_policy_v<_ExecutionPolicy> &&
+                __libcpp_is_contiguous_iterator<_ForwardIterator>::value && 
+                is_arithmetic_v<_Tp> &&
                 __is_supported_reduction<_BinaryOperation, _Tp, _Tp>::value) {
     return std::__par_backend::__parallel_for_simd_reduction_1(
         __first, __last - __first, __init, __reduce, __transform);

>From d2d9de13b6645150170c4727197641314897e716 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Mon, 16 Oct 2023 08:06:44 -0700
Subject: [PATCH 16/24] Clang formatted files

---
 .../__algorithm/pstl_backends/openmp/fill.h   | 13 ++++----
 .../pstl_backends/openmp/find_if.h            | 19 ++++++-----
 .../pstl_backends/openmp/for_each.h           | 15 +++++----
 .../pstl_backends/openmp/omp_offload.h        | 32 +++++++++----------
 .../pstl_backends/openmp/transform.h          | 19 ++++-------
 .../pstl_backends/openmp/transform_reduce.h   |  6 ++--
 6 files changed, 50 insertions(+), 54 deletions(-)

diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/fill.h b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
index 250e514b4d526fd..029b1877e309c16 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
@@ -29,12 +29,12 @@ namespace __par_backend {
 inline namespace __omp_gpu_backend {
 
 template <class _Tp, class _DifferenceType, class _Up>
-_LIBCPP_HIDE_FROM_ABI optional<__empty> __omp_parallel_for_simd_val(
-    _Tp* __first, _DifferenceType __n, const _Up& __value) noexcept {
+_LIBCPP_HIDE_FROM_ABI optional<__empty>
+__omp_parallel_for_simd_val(_Tp* __first, _DifferenceType __n, const _Up& __value) noexcept {
   __omp_gpu_backend::__omp_map_alloc(__first, __n);
 #  pragma omp target teams distribute parallel for simd firstprivate(__value)
   for (_DifferenceType __i = 0; __i < __n; ++__i)
-    *(__first+__i) = __value;
+    *(__first + __i) = __value;
   __omp_gpu_backend::__omp_map_from(__first, __n);
   return __empty{};
 }
@@ -42,11 +42,12 @@ _LIBCPP_HIDE_FROM_ABI optional<__empty> __omp_parallel_for_simd_val(
 template <class _ForwardIterator, class _DifferenceType, class _Tp>
 _LIBCPP_HIDE_FROM_ABI optional<__empty>
 __parallel_for_simd_val(_ForwardIterator __first, _DifferenceType __n, const _Tp& __value) noexcept {
-  return __omp_gpu_backend::__omp_parallel_for_simd_val(__omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __value);
+  return __omp_gpu_backend::__omp_parallel_for_simd_val(
+      __omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __value);
 }
 
-}
-}
+} // namespace __omp_gpu_backend
+} // namespace __par_backend
 
 template <class _ExecutionPolicy, class _ForwardIterator, class _Tp>
 _LIBCPP_HIDE_FROM_ABI optional<__empty>
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
index 94f7c42953c2a4a..19b581a5f72b00a 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
@@ -27,12 +27,13 @@ namespace __par_backend {
 inline namespace __omp_gpu_backend {
 
 template <class _Tp, class _DifferenceType, class _Predicate>
-_LIBCPP_HIDE_FROM_ABI _DifferenceType  __omp_parallel_for_min_idx(_Tp* __first, _DifferenceType __n, _Predicate __pred) noexcept {
+_LIBCPP_HIDE_FROM_ABI _DifferenceType
+__omp_parallel_for_min_idx(_Tp* __first, _DifferenceType __n, _Predicate __pred) noexcept {
   __omp_gpu_backend::__omp_map_to(__first, __n);
   _DifferenceType idx = __n;
-#  pragma omp target teams distribute parallel for simd reduction(min:idx)
-  for (_DifferenceType __i = 0; __i < __n; ++__i){
-    if (__pred(*(__first+__i))) {
+#  pragma omp target teams distribute parallel for simd reduction(min : idx)
+  for (_DifferenceType __i = 0; __i < __n; ++__i) {
+    if (__pred(*(__first + __i))) {
       idx = (__i < idx) ? __i : idx;
     }
   }
@@ -43,12 +44,14 @@ _LIBCPP_HIDE_FROM_ABI _DifferenceType  __omp_parallel_for_min_idx(_Tp* __first,
 // Extracting the underlying pointer
 
 template <class _ForwardIterator, class _DifferenceType, class _Predicate>
-_LIBCPP_HIDE_FROM_ABI optional<_ForwardIterator> __parallel_for_min_idx(_ForwardIterator __first, _DifferenceType __n, _Predicate __pred) noexcept {
-  return __first + __omp_gpu_backend::__omp_parallel_for_min_idx(__omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __pred);
+_LIBCPP_HIDE_FROM_ABI optional<_ForwardIterator>
+__parallel_for_min_idx(_ForwardIterator __first, _DifferenceType __n, _Predicate __pred) noexcept {
+  return __first +
+         __omp_gpu_backend::__omp_parallel_for_min_idx(__omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __pred);
 }
 
-}
-}
+} // namespace __omp_gpu_backend
+} // namespace __par_backend
 
 template <class _ExecutionPolicy, class _ForwardIterator, class _Predicate>
 _LIBCPP_HIDE_FROM_ABI optional<_ForwardIterator>
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
index 444352b4a5e51a8..eea337179b6576c 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
@@ -14,8 +14,8 @@
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
 #include <__type_traits/is_execution_policy.h>
-#include <optional>
 #include <__utility/empty.h>
+#include <optional>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -29,12 +29,12 @@ namespace __par_backend {
 inline namespace __omp_gpu_backend {
 
 template <class _Tp, class _DifferenceType, class _Function>
-_LIBCPP_HIDE_FROM_ABI optional<__empty> __omp_parallel_for_simd(
-    _Tp* __first, _DifferenceType __n, _Function __f) noexcept {
+_LIBCPP_HIDE_FROM_ABI optional<__empty>
+__omp_parallel_for_simd(_Tp* __first, _DifferenceType __n, _Function __f) noexcept {
   __omp_gpu_backend::__omp_map_to(__first, __n);
 #  pragma omp target teams distribute parallel for simd
   for (_DifferenceType __i = 0; __i < __n; ++__i)
-    __f(*(__first+__i));
+    __f(*(__first + __i));
   __omp_gpu_backend::__omp_map_from(__first, __n);
   return __empty{};
 }
@@ -42,12 +42,13 @@ _LIBCPP_HIDE_FROM_ABI optional<__empty> __omp_parallel_for_simd(
 // Extracting the underlying pointer
 
 template <class _ForwardIterator, class _DifferenceType, class _Function>
-_LIBCPP_HIDE_FROM_ABI optional<__empty> __parallel_for_simd_1(_ForwardIterator __first, _DifferenceType __n, _Function __f) noexcept {
+_LIBCPP_HIDE_FROM_ABI optional<__empty>
+__parallel_for_simd_1(_ForwardIterator __first, _DifferenceType __n, _Function __f) noexcept {
   return __omp_gpu_backend::__omp_parallel_for_simd(__omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __f);
 }
 
-}
-}
+} // namespace __omp_gpu_backend
+} // namespace __par_backend
 
 template <class _ExecutionPolicy, class _ForwardIterator, class _Functor>
 _LIBCPP_HIDE_FROM_ABI optional<__empty>
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h b/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
index 77afb35ee71b4bb..183fdfa5cfa02a5 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
@@ -9,8 +9,8 @@
 #ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_OMP_OFFLOAD_H
 #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_OMP_OFFLOAD_H
 
-#include <__assert>
 #include <__algorithm/unwrap_iter.h>
+#include <__assert>
 #include <__config>
 #include <__functional/operations.h>
 #include <__iterator/iterator_traits.h>
@@ -19,8 +19,8 @@
 #include <__memory/pointer_traits.h>
 #include <__type_traits/is_pointer.h>
 #include <__type_traits/is_same.h>
-#include <__utility/move.h>
 #include <__utility/empty.h>
+#include <__utility/move.h>
 #include <cstddef>
 #include <optional>
 
@@ -70,28 +70,28 @@ template <class _Iterator, class _DifferenceType>
 _LIBCPP_HIDE_FROM_ABI void
 __omp_map_to([[maybe_unused]] const _Iterator p, [[maybe_unused]] const _DifferenceType len) noexcept {
   static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
-#pragma omp target enter data map(to : p[0 : len])
+#  pragma omp target enter data map(to : p[0 : len])
 }
 
 template <class _Iterator, class _DifferenceType>
 _LIBCPP_HIDE_FROM_ABI void
 __omp_map_from([[maybe_unused]] const _Iterator p, [[maybe_unused]] const _DifferenceType len) noexcept {
   static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
-#pragma omp target exit data map(from : p[0 : len])
+#  pragma omp target exit data map(from : p[0 : len])
 }
 
 template <class _Iterator, class _DifferenceType>
 _LIBCPP_HIDE_FROM_ABI void
 __omp_map_alloc([[maybe_unused]] const _Iterator p, [[maybe_unused]] const _DifferenceType len) noexcept {
   static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
-#pragma omp target enter data map(alloc : p[0 : len])
+#  pragma omp target enter data map(alloc : p[0 : len])
 }
 
 template <class _Iterator, class _DifferenceType>
 _LIBCPP_HIDE_FROM_ABI void
 __omp_map_free([[maybe_unused]] const _Iterator p, [[maybe_unused]] const _DifferenceType len) noexcept {
   static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
-#pragma omp target exit data map(release : p[0 : len])
+#  pragma omp target exit data map(release : p[0 : len])
 }
 
 //===----------------------------------------------------------------------===//
@@ -112,12 +112,12 @@ __omp_map_free([[maybe_unused]] const _Iterator p, [[maybe_unused]] const _Diffe
         _DifferenceType __n,                                                                                           \
         _Tp __init,                                                                                                    \
         std_op<_BinaryOperationType> __reduce,                                                                         \
-        _UnaryOperation __transform) noexcept {                                                                         \
-      __omp_gpu_backend::__omp_map_to(__first, __n);                                                                                      \
-_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init))                                    \
+        _UnaryOperation __transform) noexcept {                                                                        \
+      __omp_gpu_backend::__omp_map_to(__first, __n);                                                                   \
+_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init))                                   \
       for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                  \
         __init = __reduce(__init, __transform(*(__first + __i)));                                                      \
-      __omp_gpu_backend::__omp_map_free(__first, __n);                                                                                    \
+      __omp_gpu_backend::__omp_map_free(__first, __n);                                                                 \
       return __init;                                                                                                   \
     }
 
@@ -134,14 +134,14 @@ _PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__in
         _DifferenceType __n,                                                                                           \
         _Tp __init,                                                                                                    \
         std_op<_BinaryOperationType> __reduce,                                                                         \
-        _UnaryOperation __transform) noexcept {                                                                         \
-      __omp_gpu_backend::__omp_map_to(__first1, __n);                                                                                     \
-      __omp_gpu_backend::__omp_map_to(__first2, __n);                                                                                     \
-_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init))                                    \
+        _UnaryOperation __transform) noexcept {                                                                        \
+      __omp_gpu_backend::__omp_map_to(__first1, __n);                                                                  \
+      __omp_gpu_backend::__omp_map_to(__first2, __n);                                                                  \
+_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init))                                   \
       for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                  \
         __init = __reduce(__init, __transform(*(__first1 + __i), *(__first2 + __i)));                                  \
-      __omp_gpu_backend::__omp_map_free(__first1, __n);                                                                                   \
-      __omp_gpu_backend::__omp_map_free(__first2, __n);                                                                                   \
+      __omp_gpu_backend::__omp_map_free(__first1, __n);                                                                \
+      __omp_gpu_backend::__omp_map_free(__first2, __n);                                                                \
       return __init;                                                                                                   \
     } // namespace __omp_gpu_backend
 
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
index b29a479c17887c7..ba218dddd4be563 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
@@ -33,11 +33,8 @@ inline namespace __omp_gpu_backend {
 //===----------------------------------------------------------------------===//
 
 template <class _Tp, class _DifferenceType, class _Up, class _Function>
-_LIBCPP_HIDE_FROM_ABI optional<__empty> __omp_parallel_for_simd(
-    _Tp* __first1,
-    _DifferenceType __n,
-    _Up* __first2,
-    _Function __f) noexcept {
+_LIBCPP_HIDE_FROM_ABI optional<__empty>
+__omp_parallel_for_simd(_Tp* __first1, _DifferenceType __n, _Up* __first2, _Function __f) noexcept {
   __omp_gpu_backend::__omp_map_alloc(__first2, __n);
   __omp_gpu_backend::__omp_map_to(__first1, __n);
 #  pragma omp target teams distribute parallel for simd
@@ -65,12 +62,8 @@ __parallel_for_simd(_Iterator1 __first1, _DifferenceType __n, _Iterator2 __first
 //===----------------------------------------------------------------------===//
 
 template <class _Tp, class _DifferenceType, class _Up, class _Vp, class _Function>
-_LIBCPP_HIDE_FROM_ABI optional<__empty> __omp_parallel_for_simd(
-    _Tp* __first1,
-    _DifferenceType __n,
-    _Up* __first2,
-    _Vp* __first3,
-    _Function __f) noexcept {
+_LIBCPP_HIDE_FROM_ABI optional<__empty>
+__omp_parallel_for_simd(_Tp* __first1, _DifferenceType __n, _Up* __first2, _Vp* __first3, _Function __f) noexcept {
   __omp_gpu_backend::__omp_map_to(__first1, __n);
   __omp_gpu_backend::__omp_map_to(__first2, __n);
   __omp_gpu_backend::__omp_map_alloc(__first3, __n);
@@ -96,8 +89,8 @@ _LIBCPP_HIDE_FROM_ABI optional<__empty> __parallel_for_simd(
       __f);
 }
 
-}
-}
+} // namespace __omp_gpu_backend
+} // namespace __par_backend
 
 template <class _ExecutionPolicy, class _ForwardIterator, class _ForwardOutIterator, class _UnaryOperation>
 _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator> __pstl_transform(
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
index ddc3ba03f8e9b08..1427184d04d0c83 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
@@ -69,8 +69,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_Tp> __pstl_transform_reduce(
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __is_parallel_execution_policy_v<_ExecutionPolicy> &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
-                __libcpp_is_contiguous_iterator<_ForwardIterator2>::value && 
-                is_arithmetic_v<_Tp> &&
+                __libcpp_is_contiguous_iterator<_ForwardIterator2>::value && is_arithmetic_v<_Tp> &&
                 __is_supported_reduction<_BinaryOperation1, _Tp, _Tp>::value) {
     return std::__par_backend::__parallel_for_simd_reduction_2(
         __first1, __first2, __last1 - __first1, __init, __reduce, __transform);
@@ -93,8 +92,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_Tp> __pstl_transform_reduce(
     _UnaryOperation __transform) {
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __is_parallel_execution_policy_v<_ExecutionPolicy> &&
-                __libcpp_is_contiguous_iterator<_ForwardIterator>::value && 
-                is_arithmetic_v<_Tp> &&
+                __libcpp_is_contiguous_iterator<_ForwardIterator>::value && is_arithmetic_v<_Tp> &&
                 __is_supported_reduction<_BinaryOperation, _Tp, _Tp>::value) {
     return std::__par_backend::__parallel_for_simd_reduction_1(
         __first, __last - __first, __init, __reduce, __transform);

>From c485b2b00a4ccfd52a24a11f27028f55980cb984 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Thu, 19 Oct 2023 11:35:54 -0700
Subject: [PATCH 17/24] Fixing issued discussed during code review on the 16th
 of October

- Updated CMake logic
- Made `fill` call `transform` with a lambda returning a constant.
- Used `__rewrap_iter` and `__unwrap_iter`.
---
 libcxx/CMakeLists.txt                         |  13 +-
 .../pstl_backends/openmp/backend.h            |   8 +
 .../__algorithm/pstl_backends/openmp/fill.h   |  31 +---
 .../pstl_backends/openmp/find_if.h            |  27 +---
 .../pstl_backends/openmp/for_each.h           |  29 +---
 .../pstl_backends/openmp/omp_offload.h        | 144 ++----------------
 .../pstl_backends/openmp/transform.h          |  67 +++-----
 .../pstl_backends/openmp/transform_reduce.h   | 130 ++++++++++++----
 8 files changed, 163 insertions(+), 286 deletions(-)

diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt
index 24dd562035d24ed..0fdeab59e89f767 100644
--- a/libcxx/CMakeLists.txt
+++ b/libcxx/CMakeLists.txt
@@ -295,15 +295,14 @@ option(LIBCXX_HAS_EXTERNAL_THREAD_API
   "Build libc++ with an externalized threading API.
    This option may only be set to ON when LIBCXX_ENABLE_THREADS=ON." OFF)
 
-set(LIBCXX_PSTL_BACKEND "openmp" CACHE INTERNAL "Which PSTL backend to use")
-if (LIBCXX_PSTL_BACKEND STREQUAL "")
-  if (LIBCXX_ENABLE_THREADS)
-    set(LIBCXX_PSTL_BACKEND "std-thread")
-  else()
-    set(LIBCXX_PSTL_BACKEND "serial")
-  endif()
+if (LIBCXX_ENABLE_THREADS)
+  set(LIBCXX_PSTL_BACKEND_DEFAULT "std-thread")
+else()
+  set(LIBCXX_PSTL_BACKEND_DEFAULT "serial")
 endif()
 
+set(LIBCXX_PSTL_BACKEND "${LIBCXX_PSTL_BACKEND_DEFAULT}" CACHE STRING "Select the PSTL backend to use. Valid values are serial, std-thread, libdispatch, openmp. Default: ${LIBCXX_PSTL_BACKEND_DEFAULT}")
+
 # Misc options ----------------------------------------------------------------
 # FIXME: Turn -pedantic back ON. It is currently off because it warns
 # about #include_next which is used everywhere.
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/backend.h b/libcxx/include/__algorithm/pstl_backends/openmp/backend.h
index e4e6136082a3427..9396f91b1a755f2 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/backend.h
@@ -20,6 +20,14 @@
 
 #if _LIBCPP_STD_VER >= 17
 
+#if defined(_LIBCPP_PSTL_BACKEND_OPENMP)
+# if !defined(_OPENMP)
+//#   warning "PSTL is configured to use the OpenMP backend, but OpenMP is not enabled. Did you compile with -fopenmp?"
+# elif (defined(_OPENMP) && _OPENMP < 201511)
+//#   warning "OpenMP target offloading has been supported since OpenMP version 4.5 (201511). Please use a more recent version of OpenMP."
+# endif
+#endif
+
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 struct __omp_backend_tag {};
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/fill.h b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
index 029b1877e309c16..e8101233e342a3e 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
@@ -13,7 +13,9 @@
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
+#include <__iterator/wrap_iter.h>
 #include <__type_traits/is_execution_policy.h>
+#include <__type_traits/remove_pointer.h>
 #include <__utility/empty.h>
 #include <optional>
 
@@ -25,30 +27,6 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
-namespace __par_backend {
-inline namespace __omp_gpu_backend {
-
-template <class _Tp, class _DifferenceType, class _Up>
-_LIBCPP_HIDE_FROM_ABI optional<__empty>
-__omp_parallel_for_simd_val(_Tp* __first, _DifferenceType __n, const _Up& __value) noexcept {
-  __omp_gpu_backend::__omp_map_alloc(__first, __n);
-#  pragma omp target teams distribute parallel for simd firstprivate(__value)
-  for (_DifferenceType __i = 0; __i < __n; ++__i)
-    *(__first + __i) = __value;
-  __omp_gpu_backend::__omp_map_from(__first, __n);
-  return __empty{};
-}
-
-template <class _ForwardIterator, class _DifferenceType, class _Tp>
-_LIBCPP_HIDE_FROM_ABI optional<__empty>
-__parallel_for_simd_val(_ForwardIterator __first, _DifferenceType __n, const _Tp& __value) noexcept {
-  return __omp_gpu_backend::__omp_parallel_for_simd_val(
-      __omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __value);
-}
-
-} // namespace __omp_gpu_backend
-} // namespace __par_backend
-
 template <class _ExecutionPolicy, class _ForwardIterator, class _Tp>
 _LIBCPP_HIDE_FROM_ABI optional<__empty>
 __pstl_fill(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) {
@@ -58,7 +36,10 @@ __pstl_fill(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __last
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __is_parallel_execution_policy_v<_ExecutionPolicy> &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
-    return std::__par_backend::__parallel_for_simd_val(__first, __last - __first, __value);
+    std::__rewrap_iter(__first, __par_backend::__omp_for_each(std::__unwrap_iter(__first), __last - __first, [&](std::remove_pointer_t<decltype(std::__unwrap_iter(__first))>& e) {
+          e = __value;
+        }));
+    return __empty{};
   }
   // Otherwise, we execute fill on the CPU instead
   else {
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
index 19b581a5f72b00a..f07965885c9c68c 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
@@ -13,6 +13,7 @@
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
+#include <__iterator/wrap_iter.h>
 #include <optional>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -23,13 +24,9 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
-namespace __par_backend {
-inline namespace __omp_gpu_backend {
-
 template <class _Tp, class _DifferenceType, class _Predicate>
-_LIBCPP_HIDE_FROM_ABI _DifferenceType
-__omp_parallel_for_min_idx(_Tp* __first, _DifferenceType __n, _Predicate __pred) noexcept {
-  __omp_gpu_backend::__omp_map_to(__first, __n);
+_LIBCPP_HIDE_FROM_ABI _Tp* __omp_find_if(_Tp* __first, _DifferenceType __n, _Predicate __pred) noexcept {
+  __par_backend::__omp_map_to(__first, __n);
   _DifferenceType idx = __n;
 #  pragma omp target teams distribute parallel for simd reduction(min : idx)
   for (_DifferenceType __i = 0; __i < __n; ++__i) {
@@ -37,29 +34,17 @@ __omp_parallel_for_min_idx(_Tp* __first, _DifferenceType __n, _Predicate __pred)
       idx = (__i < idx) ? __i : idx;
     }
   }
-  __omp_gpu_backend::__omp_map_free(__first, __n);
-  return idx;
-}
-
-// Extracting the underlying pointer
-
-template <class _ForwardIterator, class _DifferenceType, class _Predicate>
-_LIBCPP_HIDE_FROM_ABI optional<_ForwardIterator>
-__parallel_for_min_idx(_ForwardIterator __first, _DifferenceType __n, _Predicate __pred) noexcept {
-  return __first +
-         __omp_gpu_backend::__omp_parallel_for_min_idx(__omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __pred);
+  __par_backend::__omp_map_free(__first, __n);
+  return __first + idx;
 }
 
-} // namespace __omp_gpu_backend
-} // namespace __par_backend
-
 template <class _ExecutionPolicy, class _ForwardIterator, class _Predicate>
 _LIBCPP_HIDE_FROM_ABI optional<_ForwardIterator>
 __pstl_find_if(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) {
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __is_parallel_execution_policy_v<_ExecutionPolicy> &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
-    return __par_backend::__parallel_for_min_idx(__first, __last - __first, __pred);
+    return std::__rewrap_iter(__first, std::__omp_find_if(std::__unwrap_iter(__first), __last - __first, __pred));
   } else {
     return std::__pstl_find_if<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __pred);
   }
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
index eea337179b6576c..eb8caf23971cb33 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
@@ -13,6 +13,7 @@
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
+#include <__iterator/wrap_iter.h>
 #include <__type_traits/is_execution_policy.h>
 #include <__utility/empty.h>
 #include <optional>
@@ -25,31 +26,6 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
-namespace __par_backend {
-inline namespace __omp_gpu_backend {
-
-template <class _Tp, class _DifferenceType, class _Function>
-_LIBCPP_HIDE_FROM_ABI optional<__empty>
-__omp_parallel_for_simd(_Tp* __first, _DifferenceType __n, _Function __f) noexcept {
-  __omp_gpu_backend::__omp_map_to(__first, __n);
-#  pragma omp target teams distribute parallel for simd
-  for (_DifferenceType __i = 0; __i < __n; ++__i)
-    __f(*(__first + __i));
-  __omp_gpu_backend::__omp_map_from(__first, __n);
-  return __empty{};
-}
-
-// Extracting the underlying pointer
-
-template <class _ForwardIterator, class _DifferenceType, class _Function>
-_LIBCPP_HIDE_FROM_ABI optional<__empty>
-__parallel_for_simd_1(_ForwardIterator __first, _DifferenceType __n, _Function __f) noexcept {
-  return __omp_gpu_backend::__omp_parallel_for_simd(__omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __f);
-}
-
-} // namespace __omp_gpu_backend
-} // namespace __par_backend
-
 template <class _ExecutionPolicy, class _ForwardIterator, class _Functor>
 _LIBCPP_HIDE_FROM_ABI optional<__empty>
 __pstl_for_each(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Functor __func) {
@@ -59,7 +35,8 @@ __pstl_for_each(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
-    return std::__par_backend::__parallel_for_simd_1(__first, __last - __first, __func);
+    __par_backend::__omp_for_each(std::__unwrap_iter(__first), __last - __first, __func);
+    return __empty{};
   }
   // Else we fall back to the serial backend
   else {
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h b/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
index 183fdfa5cfa02a5..2e44af3c9a761f5 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
@@ -22,7 +22,6 @@
 #include <__utility/empty.h>
 #include <__utility/move.h>
 #include <cstddef>
-#include <optional>
 
 // is_same
 
@@ -43,27 +42,9 @@ namespace __par_backend {
 inline namespace __omp_gpu_backend {
 
 //===----------------------------------------------------------------------===//
-// Functions for eaxtracting the pase pointers
-//===----------------------------------------------------------------------===//
-
-// In the general case we do not need to extract it. This is for instance the
-// case for pointers.
-template <typename _Tp>
-_LIBCPP_HIDE_FROM_ABI inline auto __omp_extract_base_ptr(_Tp p) noexcept {
-  return std::__unwrap_iter(p);
-}
-
-// For vectors and arrays, etc, we need to extract the underlying base pointer.
-template <typename _Tp>
-_LIBCPP_HIDE_FROM_ABI inline _Tp __omp_extract_base_ptr(std::__wrap_iter<_Tp> w) noexcept {
-  std::pointer_traits<std::__wrap_iter<_Tp>> PT;
-  return PT.to_address(w);
-}
-
-//===----------------------------------------------------------------------===//
-// The following four functions differentiates between contiguous iterators and
-// non-contiguous iterators. That allows to use the same implementations for
-// reference and value iterators
+// The following four functions can be used to map contiguous array sections to
+// and from the device. For now, they are simple overlays of the OpenMP pragmas,
+// but they should be updated wen adding support for other iterator types.
 //===----------------------------------------------------------------------===//
 
 template <class _Iterator, class _DifferenceType>
@@ -95,117 +76,18 @@ __omp_map_free([[maybe_unused]] const _Iterator p, [[maybe_unused]] const _Diffe
 }
 
 //===----------------------------------------------------------------------===//
-// Templates for reductions
+// The OpenMP implementation of for_each is shared between for_each and fill
 //===----------------------------------------------------------------------===//
 
-// In the two following function templates, we map the pointer to the device in
-// different ways depending on if they are contiguou or not.
-
-#  define __PSTL_OMP_SIMD_1_REDUCTION(omp_op, std_op)                                                                  \
-    template <class _Iterator,                                                                                         \
-              class _DifferenceType,                                                                                   \
-              typename _Tp,                                                                                            \
-              typename _BinaryOperationType,                                                                           \
-              typename _UnaryOperation>                                                                                \
-    _LIBCPP_HIDE_FROM_ABI _Tp __omp_parallel_for_simd_reduction_1(                                                     \
-        _Iterator __first,                                                                                             \
-        _DifferenceType __n,                                                                                           \
-        _Tp __init,                                                                                                    \
-        std_op<_BinaryOperationType> __reduce,                                                                         \
-        _UnaryOperation __transform) noexcept {                                                                        \
-      __omp_gpu_backend::__omp_map_to(__first, __n);                                                                   \
-_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init))                                   \
-      for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                  \
-        __init = __reduce(__init, __transform(*(__first + __i)));                                                      \
-      __omp_gpu_backend::__omp_map_free(__first, __n);                                                                 \
-      return __init;                                                                                                   \
-    }
-
-#  define __PSTL_OMP_SIMD_2_REDUCTION(omp_op, std_op)                                                                  \
-    template <class _Iterator1,                                                                                        \
-              class _Iterator2,                                                                                        \
-              class _DifferenceType,                                                                                   \
-              typename _Tp,                                                                                            \
-              typename _BinaryOperationType,                                                                           \
-              typename _UnaryOperation >                                                                               \
-    _LIBCPP_HIDE_FROM_ABI _Tp __omp_parallel_for_simd_reduction_2(                                                     \
-        _Iterator1 __first1,                                                                                           \
-        _Iterator2 __first2,                                                                                           \
-        _DifferenceType __n,                                                                                           \
-        _Tp __init,                                                                                                    \
-        std_op<_BinaryOperationType> __reduce,                                                                         \
-        _UnaryOperation __transform) noexcept {                                                                        \
-      __omp_gpu_backend::__omp_map_to(__first1, __n);                                                                  \
-      __omp_gpu_backend::__omp_map_to(__first2, __n);                                                                  \
-_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init))                                   \
-      for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                  \
-        __init = __reduce(__init, __transform(*(__first1 + __i), *(__first2 + __i)));                                  \
-      __omp_gpu_backend::__omp_map_free(__first1, __n);                                                                \
-      __omp_gpu_backend::__omp_map_free(__first2, __n);                                                                \
-      return __init;                                                                                                   \
-    } // namespace __omp_gpu_backend
-
-#  define __PSTL_OMP_SIMD_REDUCTION(omp_op, std_op)                                                                    \
-    __PSTL_OMP_SIMD_1_REDUCTION(omp_op, std_op)                                                                        \
-    __PSTL_OMP_SIMD_2_REDUCTION(omp_op, std_op)
-
-// Addition
-__PSTL_OMP_SIMD_REDUCTION(+, std::plus)
-
-// Subtraction
-__PSTL_OMP_SIMD_REDUCTION(-, std::minus)
-
-// Multiplication
-__PSTL_OMP_SIMD_REDUCTION(*, std::multiplies)
-
-// Logical and
-__PSTL_OMP_SIMD_REDUCTION(&&, std::logical_and)
-
-// Logical or
-__PSTL_OMP_SIMD_REDUCTION(||, std::logical_or)
-
-// Bitwise and
-__PSTL_OMP_SIMD_REDUCTION(&, std::bit_and)
-
-// Bitwise or
-__PSTL_OMP_SIMD_REDUCTION(|, std::bit_or)
-
-// Bitwise xor
-__PSTL_OMP_SIMD_REDUCTION(^, std::bit_xor)
-
-// Extracting the underlying pointers
-
-template <class _Iterator, class _DifferenceType, typename _Tp, typename _BinaryOperation, typename _UnaryOperation >
-_LIBCPP_HIDE_FROM_ABI _Tp __parallel_for_simd_reduction_1(
-    _Iterator __first,
-    _DifferenceType __n,
-    _Tp __init,
-    _BinaryOperation __reduce,
-    _UnaryOperation __transform) noexcept {
-  return __omp_gpu_backend::__omp_parallel_for_simd_reduction_1(
-      __omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __init, __reduce, __transform);
-}
-
-template <class _Iterator1,
-          class _Iterator2,
-          class _DifferenceType,
-          typename _Tp,
-          typename _BinaryOperation,
-          typename _UnaryOperation >
-_LIBCPP_HIDE_FROM_ABI _Tp __parallel_for_simd_reduction_2(
-    _Iterator1 __first1,
-    _Iterator2 __first2,
-    _DifferenceType __n,
-    _Tp __init,
-    _BinaryOperation __reduce,
-    _UnaryOperation __transform) noexcept {
-  return __omp_gpu_backend::__omp_parallel_for_simd_reduction_2(
-      __omp_gpu_backend::__omp_extract_base_ptr(__first1),
-      __omp_gpu_backend::__omp_extract_base_ptr(__first2),
-      __n,
-      __init,
-      __reduce,
-      __transform);
+template <class _Tp, class _DifferenceType, class _Function>
+_LIBCPP_HIDE_FROM_ABI _Tp*
+__omp_for_each(_Tp* __first, _DifferenceType __n, _Function __f) noexcept {
+  __par_backend::__omp_map_to(__first, __n);
+#  pragma omp target teams distribute parallel for simd
+  for (_DifferenceType __i = 0; __i < __n; ++__i)
+    __f(*(__first + __i));
+  __par_backend::__omp_map_from(__first, __n);
+  return __first + __n;
 }
 
 } // namespace __omp_gpu_backend
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
index ba218dddd4be563..f7e52d7933b6c40 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
@@ -14,6 +14,7 @@
 #include <__algorithm/transform.h>
 #include <__config>
 #include <__iterator/concepts.h>
+#include <__iterator/wrap_iter.h>
 #include <__type_traits/is_execution_policy.h>
 #include <optional>
 
@@ -25,73 +26,39 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
-namespace __par_backend {
-inline namespace __omp_gpu_backend {
-
 //===----------------------------------------------------------------------===//
-// Templates for two iterators
+// OpenMP implementations of transform for one and two input iterators and one
+// output iterator
 //===----------------------------------------------------------------------===//
 
 template <class _Tp, class _DifferenceType, class _Up, class _Function>
 _LIBCPP_HIDE_FROM_ABI optional<__empty>
-__omp_parallel_for_simd(_Tp* __first1, _DifferenceType __n, _Up* __first2, _Function __f) noexcept {
-  __omp_gpu_backend::__omp_map_alloc(__first2, __n);
-  __omp_gpu_backend::__omp_map_to(__first1, __n);
+__omp_transform(_Tp* __first1, _DifferenceType __n, _Up* __first2, _Function __f) noexcept {
+  __par_backend::__omp_map_alloc(__first2, __n);
+  __par_backend::__omp_map_to(__first1, __n);
 #  pragma omp target teams distribute parallel for simd
   for (_DifferenceType __i = 0; __i < __n; ++__i)
     *(__first2 + __i) = __f(*(__first1 + __i));
-  __omp_gpu_backend::__omp_map_from(__first2, __n);
-  __omp_gpu_backend::__omp_map_free(__first1, __n);
+  __par_backend::__omp_map_from(__first2, __n);
+  __par_backend::__omp_map_free(__first1, __n);
   return __empty{};
 }
 
-// Extracting the underlying pointer
-
-template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Function>
-_LIBCPP_HIDE_FROM_ABI optional<__empty>
-__parallel_for_simd(_Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Function __f) noexcept {
-  return __omp_gpu_backend::__omp_parallel_for_simd(
-      __omp_gpu_backend::__omp_extract_base_ptr(__first1),
-      __n,
-      __omp_gpu_backend::__omp_extract_base_ptr(__first2),
-      __f);
-}
-
-//===----------------------------------------------------------------------===//
-// Templates for three iterator
-//===----------------------------------------------------------------------===//
-
 template <class _Tp, class _DifferenceType, class _Up, class _Vp, class _Function>
 _LIBCPP_HIDE_FROM_ABI optional<__empty>
-__omp_parallel_for_simd(_Tp* __first1, _DifferenceType __n, _Up* __first2, _Vp* __first3, _Function __f) noexcept {
-  __omp_gpu_backend::__omp_map_to(__first1, __n);
-  __omp_gpu_backend::__omp_map_to(__first2, __n);
-  __omp_gpu_backend::__omp_map_alloc(__first3, __n);
+__omp_transform(_Tp* __first1, _DifferenceType __n, _Up* __first2, _Vp* __first3, _Function __f) noexcept {
+  __par_backend::__omp_map_to(__first1, __n);
+  __par_backend::__omp_map_to(__first2, __n);
+  __par_backend::__omp_map_alloc(__first3, __n);
 #  pragma omp target teams distribute parallel for simd
   for (_DifferenceType __i = 0; __i < __n; ++__i)
     *(__first3 + __i) = __f(*(__first1 + __i), *(__first2 + __i));
-  __omp_gpu_backend::__omp_map_free(__first1, __n);
-  __omp_gpu_backend::__omp_map_free(__first2, __n);
-  __omp_gpu_backend::__omp_map_from(__first3, __n);
+  __par_backend::__omp_map_free(__first1, __n);
+  __par_backend::__omp_map_free(__first2, __n);
+  __par_backend::__omp_map_from(__first3, __n);
   return __empty{};
 }
 
-// Extracting the underlying pointer
-
-template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Iterator3, class _Function>
-_LIBCPP_HIDE_FROM_ABI optional<__empty> __parallel_for_simd(
-    _Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Iterator3 __first3, _Function __f) noexcept {
-  return __omp_gpu_backend::__omp_parallel_for_simd(
-      __omp_gpu_backend::__omp_extract_base_ptr(__first1),
-      __n,
-      __omp_gpu_backend::__omp_extract_base_ptr(__first2),
-      __omp_gpu_backend::__omp_extract_base_ptr(__first3),
-      __f);
-}
-
-} // namespace __omp_gpu_backend
-} // namespace __par_backend
-
 template <class _ExecutionPolicy, class _ForwardIterator, class _ForwardOutIterator, class _UnaryOperation>
 _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator> __pstl_transform(
     __omp_backend_tag,
@@ -103,7 +70,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator> __pstl_transform(
                 __is_parallel_execution_policy_v<_ExecutionPolicy> &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
-    std::__par_backend::__parallel_for_simd(__first, __last - __first, __result, __op);
+    std::__omp_transform(std::__unwrap_iter(__first), __last - __first, std::__unwrap_iter(__result), __op);
     return __result + (__last - __first);
   }
   // If it is not safe to offload to the GPU, we rely on the CPU backend.
@@ -128,7 +95,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator> __pstl_transform(
                 __libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator2>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
-    std::__par_backend::__parallel_for_simd(__first1, __last1 - __first1, __first2, __result, __op);
+    std::__omp_transform(std::__unwrap_iter(__first1), __last1 - __first1, std::__unwrap_iter(__first2), std::__unwrap_iter(__result), __op);
     return __result + (__last1 - __first1);
   }
   // If it is not safe to offload to the GPU, we rely on the CPU backend.
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
index 1427184d04d0c83..3798e1cb3f3fac1 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
@@ -13,6 +13,7 @@
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
 #include <__iterator/concepts.h>
+#include <__iterator/wrap_iter.h>
 #include <__numeric/transform_reduce.h>
 #include <__type_traits/integral_constant.h>
 #include <__type_traits/is_arithmetic.h>
@@ -28,6 +29,87 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
+//===----------------------------------------------------------------------===//
+// Templates for predefined reductions
+//===----------------------------------------------------------------------===//
+
+#  define __PSTL_OMP_SIMD_1_REDUCTION(omp_op, std_op)                                                                  \
+    template <class _Iterator,                                                                                         \
+              class _DifferenceType,                                                                                   \
+              typename _Tp,                                                                                            \
+              typename _BinaryOperationType,                                                                           \
+              typename _UnaryOperation>                                                                                \
+    _LIBCPP_HIDE_FROM_ABI _Tp __omp_transform_reduce(                                                     \
+        _Iterator __first,                                                                                             \
+        _DifferenceType __n,                                                                                           \
+        _Tp __init,                                                                                                    \
+        std_op<_BinaryOperationType> __reduce,                                                                         \
+        _UnaryOperation __transform) noexcept {                                                                        \
+      __par_backend::__omp_map_to(__first, __n);                                                                   \
+_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init))                                   \
+      for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                  \
+        __init = __reduce(__init, __transform(*(__first + __i)));                                                      \
+      __par_backend::__omp_map_free(__first, __n);                                                                 \
+      return __init;                                                                                                   \
+    }
+
+#  define __PSTL_OMP_SIMD_2_REDUCTION(omp_op, std_op)                                                                  \
+    template <class _Iterator1,                                                                                        \
+              class _Iterator2,                                                                                        \
+              class _DifferenceType,                                                                                   \
+              typename _Tp,                                                                                            \
+              typename _BinaryOperationType,                                                                           \
+              typename _UnaryOperation >                                                                               \
+    _LIBCPP_HIDE_FROM_ABI _Tp __omp_transform_reduce(                                                     \
+        _Iterator1 __first1,                                                                                           \
+        _Iterator2 __first2,                                                                                           \
+        _DifferenceType __n,                                                                                           \
+        _Tp __init,                                                                                                    \
+        std_op<_BinaryOperationType> __reduce,                                                                         \
+        _UnaryOperation __transform) noexcept {                                                                        \
+      __par_backend::__omp_map_to(__first1, __n);                                                                  \
+      __par_backend::__omp_map_to(__first2, __n);                                                                  \
+_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init))                                   \
+      for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                  \
+        __init = __reduce(__init, __transform(*(__first1 + __i), *(__first2 + __i)));                                  \
+      __par_backend::__omp_map_free(__first1, __n);                                                                \
+      __par_backend::__omp_map_free(__first2, __n);                                                                \
+      return __init;                                                                                                   \
+    }
+
+#  define __PSTL_OMP_SIMD_REDUCTION(omp_op, std_op)                                                                    \
+    __PSTL_OMP_SIMD_1_REDUCTION(omp_op, std_op)                                                                        \
+    __PSTL_OMP_SIMD_2_REDUCTION(omp_op, std_op)
+
+// Addition
+__PSTL_OMP_SIMD_REDUCTION(+, std::plus)
+
+// Subtraction
+__PSTL_OMP_SIMD_REDUCTION(-, std::minus)
+
+// Multiplication
+__PSTL_OMP_SIMD_REDUCTION(*, std::multiplies)
+
+// Logical and
+__PSTL_OMP_SIMD_REDUCTION(&&, std::logical_and)
+
+// Logical or
+__PSTL_OMP_SIMD_REDUCTION(||, std::logical_or)
+
+// Bitwise and
+__PSTL_OMP_SIMD_REDUCTION(&, std::bit_and)
+
+// Bitwise or
+__PSTL_OMP_SIMD_REDUCTION(|, std::bit_or)
+
+// Bitwise xor
+__PSTL_OMP_SIMD_REDUCTION(^, std::bit_xor)
+
+//===----------------------------------------------------------------------===//
+// The following struct is used to determine whether a reduction is supported by
+// the OpenMP backend.
+//===----------------------------------------------------------------------===//
+
 template <class _T1, class _T2, class _T3>
 struct __is_supported_reduction : std::false_type {};
 
@@ -48,9 +130,28 @@ __PSTL_IS_SUPPORTED_REDUCTION(bit_or)
 __PSTL_IS_SUPPORTED_REDUCTION(bit_xor)
 
 //===----------------------------------------------------------------------===//
-// Two input iterators
+// Implementation of PSTL transform_reduce for one and two input iterators
 //===----------------------------------------------------------------------===//
 
+template <class _ExecutionPolicy, class _ForwardIterator, class _Tp, class _BinaryOperation, class _UnaryOperation>
+_LIBCPP_HIDE_FROM_ABI optional<_Tp> __pstl_transform_reduce(
+    __omp_backend_tag,
+    _ForwardIterator __first,
+    _ForwardIterator __last,
+    _Tp __init,
+    _BinaryOperation __reduce,
+    _UnaryOperation __transform) {
+  if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
+                __is_parallel_execution_policy_v<_ExecutionPolicy> &&
+                __libcpp_is_contiguous_iterator<_ForwardIterator>::value && is_arithmetic_v<_Tp> &&
+                __is_supported_reduction<_BinaryOperation, _Tp, _Tp>::value) {
+    return std::__omp_transform_reduce(
+        std::__unwrap_iter(__first), __last - __first, __init, __reduce, __transform);
+  }
+  return std::__pstl_transform_reduce<_ExecutionPolicy>(
+      __cpu_backend_tag{}, __first, __last, std::move(__init), __reduce, __transform);
+}
+
 template <class _ExecutionPolicy,
           class _ForwardIterator1,
           class _ForwardIterator2,
@@ -71,36 +172,13 @@ _LIBCPP_HIDE_FROM_ABI optional<_Tp> __pstl_transform_reduce(
                 __libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator2>::value && is_arithmetic_v<_Tp> &&
                 __is_supported_reduction<_BinaryOperation1, _Tp, _Tp>::value) {
-    return std::__par_backend::__parallel_for_simd_reduction_2(
-        __first1, __first2, __last1 - __first1, __init, __reduce, __transform);
+    return std::__omp_transform_reduce(
+        std::__unwrap_iter(__first1), std::__unwrap_iter(__first2), __last1 - __first1, __init, __reduce, __transform);
   }
   return std::__pstl_transform_reduce<_ExecutionPolicy>(
       __cpu_backend_tag{}, __first1, __last1, __first2, std::move(__init), __reduce, __transform);
 }
 
-//===----------------------------------------------------------------------===//
-// One input iterator
-//===----------------------------------------------------------------------===//
-
-template <class _ExecutionPolicy, class _ForwardIterator, class _Tp, class _BinaryOperation, class _UnaryOperation>
-_LIBCPP_HIDE_FROM_ABI optional<_Tp> __pstl_transform_reduce(
-    __omp_backend_tag,
-    _ForwardIterator __first,
-    _ForwardIterator __last,
-    _Tp __init,
-    _BinaryOperation __reduce,
-    _UnaryOperation __transform) {
-  if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
-                __is_parallel_execution_policy_v<_ExecutionPolicy> &&
-                __libcpp_is_contiguous_iterator<_ForwardIterator>::value && is_arithmetic_v<_Tp> &&
-                __is_supported_reduction<_BinaryOperation, _Tp, _Tp>::value) {
-    return std::__par_backend::__parallel_for_simd_reduction_1(
-        __first, __last - __first, __init, __reduce, __transform);
-  }
-  return std::__pstl_transform_reduce<_ExecutionPolicy>(
-      __cpu_backend_tag{}, __first, __last, std::move(__init), __reduce, __transform);
-}
-
 _LIBCPP_END_NAMESPACE_STD
 
 #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17

>From d7eff87e04ddaa4e42c6bf3eee0e2ec7b8b77d1e Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Thu, 19 Oct 2023 17:55:42 -0700
Subject: [PATCH 18/24] Added OpenMP offloading documentation to libcxx

---
 libcxx/docs/BuildingLibcxx.rst |  11 ++++
 libcxx/docs/UsingLibcxx.rst    | 107 +++++++++++++++++++++++++++++++++
 2 files changed, 118 insertions(+)

diff --git a/libcxx/docs/BuildingLibcxx.rst b/libcxx/docs/BuildingLibcxx.rst
index 2cee97c03ced089..cbd6183cbd35660 100644
--- a/libcxx/docs/BuildingLibcxx.rst
+++ b/libcxx/docs/BuildingLibcxx.rst
@@ -408,6 +408,17 @@ libc++ Feature Options
   Use the specified GCC toolchain and standard library when building the native
   stdlib benchmark tests.
 
+.. option:: LIBCXX_PSTL_BACKEND:STRING
+
+  **Default**:: ``"serial"``
+
+  **Values**:: ``serial``, ``std-thread``, ``libdispatch``, ``openmp``
+
+  Select the desired backend for C++ parallel algorithms. All four options can
+  target multi-core CPU architectures, and ``openmp`` can additionally target
+  GPU architectures. The ``openmp`` backend requires OpenMP version 4.5 or
+  later.
+
 
 libc++ ABI Feature Options
 --------------------------
diff --git a/libcxx/docs/UsingLibcxx.rst b/libcxx/docs/UsingLibcxx.rst
index 4d7d70e1d7901d4..8b17bb48ee9fba0 100644
--- a/libcxx/docs/UsingLibcxx.rst
+++ b/libcxx/docs/UsingLibcxx.rst
@@ -600,6 +600,113 @@ Unpoisoning may not be an option, if (for example) you are not maintaining the a
 * You are using allocator, which does not call destructor during deallocation.
 * You are aware that memory allocated with an allocator may be accessed, even when unused by container.
 
+Offloading C++ Parallel Algorithms to GPUs
+------------------------------------------
+
+Experimental support for GPU offloading has been added to ``libc++``. The
+implementation uses OpenMP target offloading to leverage GPU compute resources.
+The OpenMP PSTL backend can target both NVIDIA and AMD GPUs.
+However, the implementation only supports contiguous iterators, such as 
+iterators for ``std::vector`` or ``std::array``.
+To enable the OpenMP offloading backend it must be selected with
+``LIBCXX_PSTL_BACKEND=openmp`` when installing ``libc++``. Further, when
+compiling a program, the user must specify the command line options
+``-fopenmp -fexperimental-library -stdlib=libc++``. To install LLVM with OpenMP
+offloading enabled, please read
+`the LLVM OpenMP FAQ. <https://openmp.llvm.org/SupportAndFAQ.html>`_ 
+You may also want to to visit
+`the OpenMP offloading command-line argument reference. <https://openmp.llvm.org/CommandLineArgumentReference.html#offload-command-line-arguments>`_ 
+
+Example
+~~~~~~~
+
+The following is an example of offloading vector addition to a GPU using our
+standard library extension.
+
+.. code-block:: cpp
+
+  #include <algorithm>
+  #include <execution>
+
+  template<typename T1, typename T2, typename T3>
+  void axpy(const T1 a,std::vector<T2>& x, std::vector<T3>& y)
+  {
+    std::transform(std::execution::par_unseq,x.begin(),x.end(), y.begin(), y.begin(),
+                  [=](T2 xi, T3 yi){ return a*xi + yi; });
+  }
+
+The execution policy ``std::execution::par_unseq`` states that the algorithm's
+execution may be parallelized, vectorized, and migrated across threads. This is
+the only execution mode that is safe to offload to GPUs, and for all other
+execution modes the algorithms will execute on the CPU.
+Special attention must be paid to the lambda captures when enabling GPU
+offloading. If the lambda captures by reference, the user must manually map the
+variables to the device. If capturing by reference, the above example could
+be implemented in the following way.
+
+.. code-block:: cpp
+
+  template<typename T1, typename T2, typename T3>
+  void axpy(const T1 a,std::vector<T2>& x, std::vector<T3>& y)
+  {
+  # pragma omp target data map(to:a)
+    std::transform(std::execution::par_unseq,x.begin(),x.end(), y.begin(), y.begin(),
+                  [&](T2 xi, T3 yi){ return a*xi + yi; });
+  }
+
+However, if unified shared memory, USM, is enabled, no additional data mapping
+is necessary when capturing y reference.
+
+Compiling functions for GPUs with OpenMP
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The C++ standard defines that all accesses to memory are inside a single address
+space. However, discrete GPU systems have distinct address spaces. A single
+address space can be emulated if your system supports unified shared memory.
+However, many discrete GPU systems do not, and in those cases it is important to
+pass device function pointers to the parallel algorithms. Below is an example of
+how the OpenMP `declare target` directive can be used to mark that a function
+should be compiled for both host and device. The device address of a function
+pointer can be obtained with `target map(from:<list of identifiers>)`.
+
+.. code-block:: cpp
+
+  // Declare that the function must be compiled for both host and device
+  #pragma omp declare target
+  void cube(int& n) {n*=n*n; };
+  #pragma omp end declare target
+
+  int main()
+  {
+    int * a =  new int[LEN];
+    // Initialize the array to 2 on the device
+    std::fill(std::execution::par_unseq,a, a+LEN,2);
+    // Get the device pointer for cube
+    void (*dcube)(int& n);
+    #pragma omp target map(from:dcube)
+    dcube = &cube;
+    // Pass the device function pointer to the parallel algorithm
+    std::for_each(std::execution::par_unseq,a, a+LEN,dcube);
+    // Validate that the result is 8 on the host for all array indices
+    std::for_each(std::execution::par,a, a+LEN,[&](int & n){
+      assert(n == 8);
+    });
+    delete[] a;
+    return 0;
+  }
+
+Without unified shared memory, the above example will not work if the host
+function pointer `cube` is passed to the parallel algorithm.
+
+Important notes about exception handling
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+GPU architectures do not support exception handling. If compiling a program
+containing parallel algorithms with ``clang`` 18 or newer, a program with
+exceptions in offloaded code regions will compile, but the program will
+terminate if an exception is thrown on the device. This does not conform with
+the C++ standard and exception handling on GPUs will hopefully be better
+supported in future releases of LLVM.
+
 Platform specific behavior
 ==========================
 

>From 62ab21932c45339016b4834d3028baeb3334e4ef Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Fri, 20 Oct 2023 11:26:26 -0700
Subject: [PATCH 19/24] Changed argument names to reflect input and output

---
 .../pstl_backends/openmp/backend.h            | 15 +++---
 .../__algorithm/pstl_backends/openmp/fill.h   | 20 +++++---
 .../pstl_backends/openmp/find_if.h            |  5 +-
 .../pstl_backends/openmp/for_each.h           | 17 +++++--
 .../pstl_backends/openmp/omp_offload.h        | 21 +-------
 .../pstl_backends/openmp/transform.h          | 51 +++++++++++++------
 .../pstl_backends/openmp/transform_reduce.h   | 20 ++++----
 7 files changed, 85 insertions(+), 64 deletions(-)

diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/backend.h b/libcxx/include/__algorithm/pstl_backends/openmp/backend.h
index 9396f91b1a755f2..99245a7d13e9917 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/backend.h
@@ -20,13 +20,14 @@
 
 #if _LIBCPP_STD_VER >= 17
 
-#if defined(_LIBCPP_PSTL_BACKEND_OPENMP)
-# if !defined(_OPENMP)
-//#   warning "PSTL is configured to use the OpenMP backend, but OpenMP is not enabled. Did you compile with -fopenmp?"
-# elif (defined(_OPENMP) && _OPENMP < 201511)
-//#   warning "OpenMP target offloading has been supported since OpenMP version 4.5 (201511). Please use a more recent version of OpenMP."
-# endif
-#endif
+#  if defined(_LIBCPP_PSTL_BACKEND_OPENMP)
+#    if !defined(_OPENMP)
+// #   warning "PSTL is configured to use the OpenMP backend, but OpenMP is not enabled. Did you compile with -fopenmp?"
+#    elif (defined(_OPENMP) && _OPENMP < 201511)
+// #   warning "OpenMP target offloading has been supported since OpenMP version 4.5 (201511). Please use a more recent
+// version of OpenMP."
+#    endif
+#  endif
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/fill.h b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
index e8101233e342a3e..4262f778fa9121c 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
@@ -27,18 +27,26 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
+template <class _Tp, class _DifferenceType, class _Up>
+_LIBCPP_HIDE_FROM_ABI _Tp* __omp_fill(_Tp* __out1, _DifferenceType __n, const _Up& __value) noexcept {
+  __par_backend::__omp_map_alloc(__out1, __n);
+#  pragma omp target teams distribute parallel for simd
+  for (_DifferenceType __i = 0; __i < __n; ++__i)
+    *(__out1 + __i) = __value;
+  __par_backend::__omp_map_from(__out1, __n);
+  return __out1 + __n;
+}
+
 template <class _ExecutionPolicy, class _ForwardIterator, class _Tp>
 _LIBCPP_HIDE_FROM_ABI optional<__empty>
 __pstl_fill(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) {
-  // It is only safe to execute fill on the GPU, it the execution policy is
-  // parallel unsequenced, as it is the only execution policy allowing
-  // SIMD instructions
+  // If it is safe to offload the computations to the GPU, we call the OpenMP
+  // implementation of for_each. In the case of fill, we provide for_Each with a
+  // lambda returning a constant.
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __is_parallel_execution_policy_v<_ExecutionPolicy> &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
-    std::__rewrap_iter(__first, __par_backend::__omp_for_each(std::__unwrap_iter(__first), __last - __first, [&](std::remove_pointer_t<decltype(std::__unwrap_iter(__first))>& e) {
-          e = __value;
-        }));
+    std::__rewrap_iter(__first, std::__omp_fill(std::__unwrap_iter(__first), __last - __first, __value));
     return __empty{};
   }
   // Otherwise, we execute fill on the CPU instead
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
index f07965885c9c68c..8d7b196ec176f71 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
@@ -34,17 +34,20 @@ _LIBCPP_HIDE_FROM_ABI _Tp* __omp_find_if(_Tp* __first, _DifferenceType __n, _Pre
       idx = (__i < idx) ? __i : idx;
     }
   }
-  __par_backend::__omp_map_free(__first, __n);
+  __par_backend::__omp_map_release(__first, __n);
   return __first + idx;
 }
 
 template <class _ExecutionPolicy, class _ForwardIterator, class _Predicate>
 _LIBCPP_HIDE_FROM_ABI optional<_ForwardIterator>
 __pstl_find_if(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) {
+  // If it is safe to offload the computations to the GPU, we call the OpenMP
+  // implementation of find_if
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __is_parallel_execution_policy_v<_ExecutionPolicy> &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
     return std::__rewrap_iter(__first, std::__omp_find_if(std::__unwrap_iter(__first), __last - __first, __pred));
+    // Else we rey on the CPU PSTL backend
   } else {
     return std::__pstl_find_if<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __pred);
   }
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
index eb8caf23971cb33..f64efc170537af6 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
@@ -26,16 +26,25 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
+template <class _Tp, class _DifferenceType, class _Function>
+_LIBCPP_HIDE_FROM_ABI _Tp* __omp_for_each(_Tp* __inout1, _DifferenceType __n, _Function __f) noexcept {
+  __par_backend::__omp_map_to(__inout1, __n);
+#  pragma omp target teams distribute parallel for simd
+  for (_DifferenceType __i = 0; __i < __n; ++__i)
+    __f(*(__inout1 + __i));
+  __par_backend::__omp_map_from(__inout1, __n);
+  return __inout1 + __n;
+}
+
 template <class _ExecutionPolicy, class _ForwardIterator, class _Functor>
 _LIBCPP_HIDE_FROM_ABI optional<__empty>
 __pstl_for_each(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Functor __func) {
-  // It is only safe to execute for_each on the GPU, it the execution policy is
-  // parallel unsequenced, as it is the only execution policy prohibiting throwing
-  // exceptions and allowing SIMD instructions
+  // If it is safe to offload the computations to the GPU, we call the OpenMP
+  // implementation of for_each
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
-    __par_backend::__omp_for_each(std::__unwrap_iter(__first), __last - __first, __func);
+    std::__omp_for_each(std::__unwrap_iter(__first), __last - __first, __func);
     return __empty{};
   }
   // Else we fall back to the serial backend
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h b/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
index 2e44af3c9a761f5..88ee4cdbbc4faec 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
@@ -23,10 +23,6 @@
 #include <__utility/move.h>
 #include <cstddef>
 
-// is_same
-
-// __libcpp_is_contiguous_iterator
-
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
 #endif
@@ -70,26 +66,11 @@ __omp_map_alloc([[maybe_unused]] const _Iterator p, [[maybe_unused]] const _Diff
 
 template <class _Iterator, class _DifferenceType>
 _LIBCPP_HIDE_FROM_ABI void
-__omp_map_free([[maybe_unused]] const _Iterator p, [[maybe_unused]] const _DifferenceType len) noexcept {
+__omp_map_release([[maybe_unused]] const _Iterator p, [[maybe_unused]] const _DifferenceType len) noexcept {
   static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
 #  pragma omp target exit data map(release : p[0 : len])
 }
 
-//===----------------------------------------------------------------------===//
-// The OpenMP implementation of for_each is shared between for_each and fill
-//===----------------------------------------------------------------------===//
-
-template <class _Tp, class _DifferenceType, class _Function>
-_LIBCPP_HIDE_FROM_ABI _Tp*
-__omp_for_each(_Tp* __first, _DifferenceType __n, _Function __f) noexcept {
-  __par_backend::__omp_map_to(__first, __n);
-#  pragma omp target teams distribute parallel for simd
-  for (_DifferenceType __i = 0; __i < __n; ++__i)
-    __f(*(__first + __i));
-  __par_backend::__omp_map_from(__first, __n);
-  return __first + __n;
-}
-
 } // namespace __omp_gpu_backend
 } // namespace __par_backend
 
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
index f7e52d7933b6c40..bd795d5571f0d79 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
@@ -33,29 +33,45 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <class _Tp, class _DifferenceType, class _Up, class _Function>
 _LIBCPP_HIDE_FROM_ABI optional<__empty>
-__omp_transform(_Tp* __first1, _DifferenceType __n, _Up* __first2, _Function __f) noexcept {
-  __par_backend::__omp_map_alloc(__first2, __n);
-  __par_backend::__omp_map_to(__first1, __n);
+__omp_transform(_Tp* __in1, _DifferenceType __n, _Up* __out1, _Function __f) noexcept {
+  // The order of the following maps matter, as we wish to move the data. If
+  // they were placed in the reverse order, and __in equals __out, then we would
+  // allocate the buffer on the device without copying the data.
+  __par_backend::__omp_map_to(__in1, __n);
+  __par_backend::__omp_map_alloc(__out1, __n);
 #  pragma omp target teams distribute parallel for simd
   for (_DifferenceType __i = 0; __i < __n; ++__i)
-    *(__first2 + __i) = __f(*(__first1 + __i));
-  __par_backend::__omp_map_from(__first2, __n);
-  __par_backend::__omp_map_free(__first1, __n);
+    *(__out1 + __i) = __f(*(__in1 + __i));
+  // The order of the following two maps matters, since the user could legally
+  // overwrite __in The "release" map modifier decreases the reference counter
+  // by one, and "from" only moves the data to the host, when the reference
+  // count is decremented to zero.
+  __par_backend::__omp_map_release(__in1, __n);
+  __par_backend::__omp_map_from(__out1, __n);
   return __empty{};
 }
 
 template <class _Tp, class _DifferenceType, class _Up, class _Vp, class _Function>
 _LIBCPP_HIDE_FROM_ABI optional<__empty>
-__omp_transform(_Tp* __first1, _DifferenceType __n, _Up* __first2, _Vp* __first3, _Function __f) noexcept {
-  __par_backend::__omp_map_to(__first1, __n);
-  __par_backend::__omp_map_to(__first2, __n);
-  __par_backend::__omp_map_alloc(__first3, __n);
+__omp_transform(_Tp* __in1, _DifferenceType __n, _Up* __in2, _Vp* __out1, _Function __f) noexcept {
+  // The order of the following maps matter, as we wish to move the data. If
+  // they were placed in the reverse order, and __out equals __in1 or __in2,
+  // then we would allocate one of the buffer on the device without copying the
+  // data.
+  __par_backend::__omp_map_to(__in1, __n);
+  __par_backend::__omp_map_to(__in2, __n);
+  __par_backend::__omp_map_alloc(__out1, __n);
 #  pragma omp target teams distribute parallel for simd
   for (_DifferenceType __i = 0; __i < __n; ++__i)
-    *(__first3 + __i) = __f(*(__first1 + __i), *(__first2 + __i));
-  __par_backend::__omp_map_free(__first1, __n);
-  __par_backend::__omp_map_free(__first2, __n);
-  __par_backend::__omp_map_from(__first3, __n);
+    *(__out1 + __i) = __f(*(__in1 + __i), *(__in2 + __i));
+  // The order of the following three maps matters, since the user could legally
+  // overwrite either of the inputs if __out equals __in1 or __in2. The
+  // "release" map modifier decreases the reference counter by one, and "from"
+  // only moves the data from the device, when the reference count is
+  // decremented to zero.
+  __par_backend::__omp_map_release(__in1, __n);
+  __par_backend::__omp_map_release(__in2, __n);
+  __par_backend::__omp_map_from(__out1, __n);
   return __empty{};
 }
 
@@ -95,7 +111,12 @@ _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator> __pstl_transform(
                 __libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator2>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
-    std::__omp_transform(std::__unwrap_iter(__first1), __last1 - __first1, std::__unwrap_iter(__first2), std::__unwrap_iter(__result), __op);
+    std::__omp_transform(
+        std::__unwrap_iter(__first1),
+        __last1 - __first1,
+        std::__unwrap_iter(__first2),
+        std::__unwrap_iter(__result),
+        __op);
     return __result + (__last1 - __first1);
   }
   // If it is not safe to offload to the GPU, we rely on the CPU backend.
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
index 3798e1cb3f3fac1..1684c3d273b2239 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
@@ -39,17 +39,17 @@ _LIBCPP_BEGIN_NAMESPACE_STD
               typename _Tp,                                                                                            \
               typename _BinaryOperationType,                                                                           \
               typename _UnaryOperation>                                                                                \
-    _LIBCPP_HIDE_FROM_ABI _Tp __omp_transform_reduce(                                                     \
+    _LIBCPP_HIDE_FROM_ABI _Tp __omp_transform_reduce(                                                                  \
         _Iterator __first,                                                                                             \
         _DifferenceType __n,                                                                                           \
         _Tp __init,                                                                                                    \
         std_op<_BinaryOperationType> __reduce,                                                                         \
         _UnaryOperation __transform) noexcept {                                                                        \
-      __par_backend::__omp_map_to(__first, __n);                                                                   \
+      __par_backend::__omp_map_to(__first, __n);                                                                       \
 _PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init))                                   \
       for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                  \
         __init = __reduce(__init, __transform(*(__first + __i)));                                                      \
-      __par_backend::__omp_map_free(__first, __n);                                                                 \
+      __par_backend::__omp_map_release(__first, __n);                                                                  \
       return __init;                                                                                                   \
     }
 
@@ -60,20 +60,20 @@ _PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__in
               typename _Tp,                                                                                            \
               typename _BinaryOperationType,                                                                           \
               typename _UnaryOperation >                                                                               \
-    _LIBCPP_HIDE_FROM_ABI _Tp __omp_transform_reduce(                                                     \
+    _LIBCPP_HIDE_FROM_ABI _Tp __omp_transform_reduce(                                                                  \
         _Iterator1 __first1,                                                                                           \
         _Iterator2 __first2,                                                                                           \
         _DifferenceType __n,                                                                                           \
         _Tp __init,                                                                                                    \
         std_op<_BinaryOperationType> __reduce,                                                                         \
         _UnaryOperation __transform) noexcept {                                                                        \
-      __par_backend::__omp_map_to(__first1, __n);                                                                  \
-      __par_backend::__omp_map_to(__first2, __n);                                                                  \
+      __par_backend::__omp_map_to(__first1, __n);                                                                      \
+      __par_backend::__omp_map_to(__first2, __n);                                                                      \
 _PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init))                                   \
       for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                  \
         __init = __reduce(__init, __transform(*(__first1 + __i), *(__first2 + __i)));                                  \
-      __par_backend::__omp_map_free(__first1, __n);                                                                \
-      __par_backend::__omp_map_free(__first2, __n);                                                                \
+      __par_backend::__omp_map_release(__first1, __n);                                                                 \
+      __par_backend::__omp_map_release(__first2, __n);                                                                 \
       return __init;                                                                                                   \
     }
 
@@ -145,8 +145,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_Tp> __pstl_transform_reduce(
                 __is_parallel_execution_policy_v<_ExecutionPolicy> &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value && is_arithmetic_v<_Tp> &&
                 __is_supported_reduction<_BinaryOperation, _Tp, _Tp>::value) {
-    return std::__omp_transform_reduce(
-        std::__unwrap_iter(__first), __last - __first, __init, __reduce, __transform);
+    return std::__omp_transform_reduce(std::__unwrap_iter(__first), __last - __first, __init, __reduce, __transform);
   }
   return std::__pstl_transform_reduce<_ExecutionPolicy>(
       __cpu_backend_tag{}, __first, __last, std::move(__init), __reduce, __transform);
@@ -166,7 +165,6 @@ _LIBCPP_HIDE_FROM_ABI optional<_Tp> __pstl_transform_reduce(
     _Tp __init,
     _BinaryOperation1 __reduce,
     _BinaryOperation2 __transform) {
-  // The interface for the function switched between C++17 and C++20
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __is_parallel_execution_policy_v<_ExecutionPolicy> &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&

>From dd894b6872aea647604f6085be26d7aae98ffb0e Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Fri, 20 Oct 2023 11:42:13 -0700
Subject: [PATCH 20/24] Adding compile time errors to
 pstl_backend/openmp/backend.h to validate that OpenMP is enabled

---
 libcxx/docs/UsingLibcxx.rst                               | 1 +
 libcxx/include/__algorithm/pstl_backends/openmp/backend.h | 5 ++---
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/libcxx/docs/UsingLibcxx.rst b/libcxx/docs/UsingLibcxx.rst
index 8b17bb48ee9fba0..27c3983e5637941 100644
--- a/libcxx/docs/UsingLibcxx.rst
+++ b/libcxx/docs/UsingLibcxx.rst
@@ -659,6 +659,7 @@ is necessary when capturing y reference.
 
 Compiling functions for GPUs with OpenMP
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
 The C++ standard defines that all accesses to memory are inside a single address
 space. However, discrete GPU systems have distinct address spaces. A single
 address space can be emulated if your system supports unified shared memory.
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/backend.h b/libcxx/include/__algorithm/pstl_backends/openmp/backend.h
index 99245a7d13e9917..11ab0bb108b2251 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/backend.h
@@ -22,10 +22,9 @@
 
 #  if defined(_LIBCPP_PSTL_BACKEND_OPENMP)
 #    if !defined(_OPENMP)
-// #   warning "PSTL is configured to use the OpenMP backend, but OpenMP is not enabled. Did you compile with -fopenmp?"
+#   error "PSTL is configured to use the OpenMP backend, but OpenMP is not enabled. Did you compile with -fopenmp?"
 #    elif (defined(_OPENMP) && _OPENMP < 201511)
-// #   warning "OpenMP target offloading has been supported since OpenMP version 4.5 (201511). Please use a more recent
-// version of OpenMP."
+#   error "OpenMP target offloading has been supported since OpenMP version 4.5 (201511). Please use a more recent version of OpenMP."
 #    endif
 #  endif
 

>From d282ab15cf55fb06e9c6ab81d6e8aad3a6930559 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Fri, 20 Oct 2023 11:58:27 -0700
Subject: [PATCH 21/24] Clang-formatted error message in backend.h

---
 libcxx/include/__algorithm/pstl_backends/openmp/backend.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/backend.h b/libcxx/include/__algorithm/pstl_backends/openmp/backend.h
index 11ab0bb108b2251..eb5e40d6f20a94d 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/backend.h
@@ -22,9 +22,10 @@
 
 #  if defined(_LIBCPP_PSTL_BACKEND_OPENMP)
 #    if !defined(_OPENMP)
-#   error "PSTL is configured to use the OpenMP backend, but OpenMP is not enabled. Did you compile with -fopenmp?"
+#      error "PSTL is configured to use the OpenMP backend, but OpenMP is not enabled. Did you compile with -fopenmp?"
 #    elif (defined(_OPENMP) && _OPENMP < 201511)
-#   error "OpenMP target offloading has been supported since OpenMP version 4.5 (201511). Please use a more recent version of OpenMP."
+#      error                                                                                                           \
+          "OpenMP target offloading has been supported since OpenMP version 4.5 (201511). Please use a more recent version of OpenMP."
 #    endif
 #  endif
 

>From 4d41118c6b9f7f8f27240a6bab715afba9fa4f47 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Fri, 20 Oct 2023 12:48:16 -0700
Subject: [PATCH 22/24] Disabling ADL in pstl_backends/openmp/stable_sort.h

---
 libcxx/include/__algorithm/pstl_backends/openmp/stable_sort.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/stable_sort.h b/libcxx/include/__algorithm/pstl_backends/openmp/stable_sort.h
index a4c6a2bff9f92fd..0b5ce39a2344a9a 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/stable_sort.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/stable_sort.h
@@ -28,7 +28,7 @@ template <class _ExecutionPolicy, class _RandomAccessIterator, class _Comp>
 _LIBCPP_HIDE_FROM_ABI optional<__empty>
 __pstl_stable_sort(__omp_backend_tag, _RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp) {
   // TODO: Implement GPU backend.
-  return __pstl_stable_sort<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __comp);
+  return std::__pstl_stable_sort<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __comp);
 }
 
 _LIBCPP_END_NAMESPACE_STD

>From d3abcc55d0a785663e858b3983589ea3a321dc2a Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Fri, 20 Oct 2023 12:54:28 -0700
Subject: [PATCH 23/24] Added std::__rewrap_iter to std::transform

---
 .../__algorithm/pstl_backends/openmp/transform.h | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
index bd795d5571f0d79..0338771728d9d0a 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
@@ -32,7 +32,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 //===----------------------------------------------------------------------===//
 
 template <class _Tp, class _DifferenceType, class _Up, class _Function>
-_LIBCPP_HIDE_FROM_ABI optional<__empty>
+_LIBCPP_HIDE_FROM_ABI _Tp*
 __omp_transform(_Tp* __in1, _DifferenceType __n, _Up* __out1, _Function __f) noexcept {
   // The order of the following maps matter, as we wish to move the data. If
   // they were placed in the reverse order, and __in equals __out, then we would
@@ -48,11 +48,11 @@ __omp_transform(_Tp* __in1, _DifferenceType __n, _Up* __out1, _Function __f) noe
   // count is decremented to zero.
   __par_backend::__omp_map_release(__in1, __n);
   __par_backend::__omp_map_from(__out1, __n);
-  return __empty{};
+  return __out1 + __n;
 }
 
 template <class _Tp, class _DifferenceType, class _Up, class _Vp, class _Function>
-_LIBCPP_HIDE_FROM_ABI optional<__empty>
+_LIBCPP_HIDE_FROM_ABI _Tp*
 __omp_transform(_Tp* __in1, _DifferenceType __n, _Up* __in2, _Vp* __out1, _Function __f) noexcept {
   // The order of the following maps matter, as we wish to move the data. If
   // they were placed in the reverse order, and __out equals __in1 or __in2,
@@ -72,7 +72,7 @@ __omp_transform(_Tp* __in1, _DifferenceType __n, _Up* __in2, _Vp* __out1, _Funct
   __par_backend::__omp_map_release(__in1, __n);
   __par_backend::__omp_map_release(__in2, __n);
   __par_backend::__omp_map_from(__out1, __n);
-  return __empty{};
+  return __out1 + __n;
 }
 
 template <class _ExecutionPolicy, class _ForwardIterator, class _ForwardOutIterator, class _UnaryOperation>
@@ -86,8 +86,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator> __pstl_transform(
                 __is_parallel_execution_policy_v<_ExecutionPolicy> &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
-    std::__omp_transform(std::__unwrap_iter(__first), __last - __first, std::__unwrap_iter(__result), __op);
-    return __result + (__last - __first);
+    std::__rewrap_iter(__result,std::__omp_transform(std::__unwrap_iter(__first), __last - __first, std::__unwrap_iter(__result), __op));
   }
   // If it is not safe to offload to the GPU, we rely on the CPU backend.
   return std::__pstl_transform<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __result, __op);
@@ -111,13 +110,12 @@ _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator> __pstl_transform(
                 __libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator2>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
-    std::__omp_transform(
+    return std::__rewrap_iter(__result,std::__omp_transform(
         std::__unwrap_iter(__first1),
         __last1 - __first1,
         std::__unwrap_iter(__first2),
         std::__unwrap_iter(__result),
-        __op);
-    return __result + (__last1 - __first1);
+        __op));
   }
   // If it is not safe to offload to the GPU, we rely on the CPU backend.
   return std::__pstl_transform<_ExecutionPolicy>(__cpu_backend_tag{}, __first1, __last1, __first2, __result, __op);

>From 7711c9117c84ae6e1701642b76524b15c03102df Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Fri, 20 Oct 2023 13:17:56 -0700
Subject: [PATCH 24/24] Clang-formatted
 libcxx/include/__algorithm/pstl_backends/openmp/transform.h

---
 .../pstl_backends/openmp/transform.h          | 21 +++++++++++--------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
index 0338771728d9d0a..e1c5f54675b3870 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
@@ -32,8 +32,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 //===----------------------------------------------------------------------===//
 
 template <class _Tp, class _DifferenceType, class _Up, class _Function>
-_LIBCPP_HIDE_FROM_ABI _Tp*
-__omp_transform(_Tp* __in1, _DifferenceType __n, _Up* __out1, _Function __f) noexcept {
+_LIBCPP_HIDE_FROM_ABI _Tp* __omp_transform(_Tp* __in1, _DifferenceType __n, _Up* __out1, _Function __f) noexcept {
   // The order of the following maps matter, as we wish to move the data. If
   // they were placed in the reverse order, and __in equals __out, then we would
   // allocate the buffer on the device without copying the data.
@@ -86,7 +85,9 @@ _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator> __pstl_transform(
                 __is_parallel_execution_policy_v<_ExecutionPolicy> &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
-    std::__rewrap_iter(__result,std::__omp_transform(std::__unwrap_iter(__first), __last - __first, std::__unwrap_iter(__result), __op));
+    std::__rewrap_iter(
+        __result,
+        std::__omp_transform(std::__unwrap_iter(__first), __last - __first, std::__unwrap_iter(__result), __op));
   }
   // If it is not safe to offload to the GPU, we rely on the CPU backend.
   return std::__pstl_transform<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __result, __op);
@@ -110,12 +111,14 @@ _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator> __pstl_transform(
                 __libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator2>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
-    return std::__rewrap_iter(__result,std::__omp_transform(
-        std::__unwrap_iter(__first1),
-        __last1 - __first1,
-        std::__unwrap_iter(__first2),
-        std::__unwrap_iter(__result),
-        __op));
+    return std::__rewrap_iter(
+        __result,
+        std::__omp_transform(
+            std::__unwrap_iter(__first1),
+            __last1 - __first1,
+            std::__unwrap_iter(__first2),
+            std::__unwrap_iter(__result),
+            __op));
   }
   // If it is not safe to offload to the GPU, we rely on the CPU backend.
   return std::__pstl_transform<_ExecutionPolicy>(__cpu_backend_tag{}, __first1, __last1, __first2, __result, __op);