[libcxx] [compiler-rt] [llvm] [lldb] [libcxxabi] [mlir] [clang] Adding Separate OpenMP Offloading Backend to `libcxx/include/__algorithm/pstl_backends` (PR #66968)

Thu Nov 2 16:43:58 PDT 2023

https://github.com/AntonRydahl updated https://github.com/llvm/llvm-project/pull/66968

>From c0a29600c8ed408018e86603ad8ac336f1eb3721 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Wed, 20 Sep 2023 17:06:10 -0700
Subject: [PATCH 01/46] Adding OpenMP Offloading Backend for C++ Parallel
 Algorithms

---
 libcxx/CMakeLists.txt                         | 14 +++
 libcxx/include/CMakeLists.txt                 |  5 +
 libcxx/include/__algorithm/pstl_backend.h     |  8 ++
 .../__algorithm/pstl_backends/gpu_backend.h   | 21 +++++
 .../pstl_backends/gpu_backends/backend.h      | 33 +++++++
 .../pstl_backends/gpu_backends/fill.h         | 59 ++++++++++++
 .../pstl_backends/gpu_backends/for_each.h     | 59 ++++++++++++
 .../pstl_backends/gpu_backends/omp_offload.h  | 91 +++++++++++++++++++
 libcxx/include/__config_site.in               |  1 +
 9 files changed, 291 insertions(+)
 create mode 100644 libcxx/include/__algorithm/pstl_backends/gpu_backend.h
 create mode 100644 libcxx/include/__algorithm/pstl_backends/gpu_backends/backend.h
 create mode 100644 libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
 create mode 100644 libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
 create mode 100644 libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h

diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt
index d03421afde1e755..7abc2f50c5d9272 100644
--- a/libcxx/CMakeLists.txt
+++ b/libcxx/CMakeLists.txt
@@ -294,6 +294,8 @@ option(LIBCXX_HAS_WIN32_THREAD_API "Ignore auto-detection and force use of win32
 option(LIBCXX_HAS_EXTERNAL_THREAD_API
   "Build libc++ with an externalized threading API.
    This option may only be set to ON when LIBCXX_ENABLE_THREADS=ON." OFF)
+option(LIBCXX_ENABLE_GPU_OFFLOAD 
+  "Build libc++ with support for GPU offload" OFF)
 
 if (LIBCXX_ENABLE_THREADS)
   set(LIBCXX_PSTL_CPU_BACKEND "std_thread" CACHE STRING "Which PSTL CPU backend to use")
@@ -301,6 +303,14 @@ else()
   set(LIBCXX_PSTL_CPU_BACKEND "serial" CACHE STRING "Which PSTL CPU backend to use")
 endif()
 
+if (NOT DEFINED LIBCXX_PSTL_GPU_BACKEND)
+  if (${LIBCXX_ENABLE_GPU_OFFLOAD})
+    set(LIBCXX_PSTL_GPU_BACKEND "omp_offload" CACHE STRING "Which PSTL GPU backend to use")
+  else()
+    set(LIBCXX_PSTL_GPU_BACKEND "none" CACHE STRING "Which PSTL GPU backend to use")
+  endif()
+endif()
+
 # Misc options ----------------------------------------------------------------
 # FIXME: Turn -pedantic back ON. It is currently off because it warns
 # about #include_next which is used everywhere.
@@ -783,6 +793,10 @@ else()
                        Valid backends are: serial, std_thread and libdispatch")
 endif()
 
+if (LIBCXX_PSTL_GPU_BACKEND STREQUAL "omp_offload")
+  config_define(1 _LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
+endif()
+
 if (LIBCXX_ABI_DEFINES)
   set(abi_defines)
   foreach (abi_define ${LIBCXX_ABI_DEFINES})
diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index b7b14200498a298..7e524cfb4f72ae5 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -85,6 +85,11 @@ set(files
   __algorithm/pstl_backends/cpu_backends/thread.h
   __algorithm/pstl_backends/cpu_backends/transform.h
   __algorithm/pstl_backends/cpu_backends/transform_reduce.h
+  __algorithm/pstl_backends/gpu_backend.h
+  __algorithm/pstl_backends/gpu_backends/backend.h
+  __algorithm/pstl_backends/gpu_backends/fill.h
+  __algorithm/pstl_backends/gpu_backends/for_each.h
+  __algorithm/pstl_backends/gpu_backends/omp_offload.h
   __algorithm/pstl_copy.h
   __algorithm/pstl_count.h
   __algorithm/pstl_fill.h
diff --git a/libcxx/include/__algorithm/pstl_backend.h b/libcxx/include/__algorithm/pstl_backend.h
index dcb1c99e7962e63..aeba1cdf8dd9f05 100644
--- a/libcxx/include/__algorithm/pstl_backend.h
+++ b/libcxx/include/__algorithm/pstl_backend.h
@@ -10,6 +10,7 @@
 #define _LIBCPP___ALGORITHM_PSTL_BACKEND_H
 
 #include <__algorithm/pstl_backends/cpu_backend.h>
+#include <__algorithm/pstl_backends/gpu_backend.h>
 #include <__config>
 #include <execution>
 
@@ -210,10 +211,17 @@ struct __select_backend<std::execution::parallel_policy> {
   using type = __cpu_backend_tag;
 };
 
+#   if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
+template <>
+struct __select_backend<std::execution::parallel_unsequenced_policy> {
+  using type = __gpu_backend_tag;
+};
+#   else
 template <>
 struct __select_backend<std::execution::parallel_unsequenced_policy> {
   using type = __cpu_backend_tag;
 };
+#   endif
 
 #  else
 
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
new file mode 100644
index 000000000000000..46a85f77b5deb99
--- /dev/null
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
@@ -0,0 +1,21 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
+
+#include <__config>
+
+#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+
+#if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
+#include <__algorithm/pstl_backends/gpu_backends/fill.h>
+#include <__algorithm/pstl_backends/gpu_backends/for_each.h>
+#endif
+
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/backend.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/backend.h
new file mode 100644
index 000000000000000..a8b400afbb94d9d
--- /dev/null
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/backend.h
@@ -0,0 +1,33 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_BACKEND_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_BACKEND_H
+
+#include <__config>
+#include <cstddef>
+
+#if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
+#  include <__algorithm/pstl_backends/gpu_backends/omp_offload.h>
+#endif
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+#if _LIBCPP_STD_VER >= 17
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+struct __gpu_backend_tag {};
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // _LIBCPP_STD_VER >= 17
+
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_BACKEND_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
new file mode 100644
index 000000000000000..5603e18a5d2d3fc
--- /dev/null
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
@@ -0,0 +1,59 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FILL_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FILL_H
+
+#include <__algorithm/fill.h>
+#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
+#include <__config>
+#include <__iterator/concepts.h>
+#include <__type_traits/is_execution_policy.h>
+#include <__utility/terminate_on_exception.h>
+#include <stdio.h>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+template <class _ExecutionPolicy, class _ForwardIterator, class _Tp>
+_LIBCPP_HIDE_FROM_ABI void
+__pstl_fill(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) {
+  // It is only safe to execute for_each on the GPU, it the execution policy is 
+  // parallel unsequenced, as it is the only execution policy prohibiting throwing
+  // exceptions and allowing SIMD instructions
+  if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
+                       __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+    std::__par_backend::__parallel_for_simd_val_1(__first, __last - __first, __value);
+  }
+  // Else if the excution policy is parallel, we execute for_each on the CPU instead
+  else  if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
+                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+    std::__terminate_on_exception([&] {
+      __par_backend::__parallel_for(
+          __first, __last, [&__value](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
+            std::__pstl_fill<__remove_parallel_policy_t<_ExecutionPolicy>>(
+                __cpu_backend_tag{}, __brick_first, __brick_last, __value);
+          });
+    });
+  // Else we execute for_each in serial
+  } else {
+    std::fill(__first, __last, __value);
+  }
+}
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FILL_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
new file mode 100644
index 000000000000000..20486d83863f420
--- /dev/null
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
@@ -0,0 +1,59 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FOR_EACH_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FOR_EACH_H
+
+#include <__algorithm/for_each.h>
+#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
+#include <__config>
+#include <__iterator/concepts.h>
+#include <__type_traits/is_execution_policy.h>
+#include <__utility/terminate_on_exception.h>
+#include <stdio.h>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+template <class _ExecutionPolicy, class _ForwardIterator, class _Functor>
+_LIBCPP_HIDE_FROM_ABI void
+__pstl_for_each(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Functor __func) {
+  // It is only safe to execute for_each on the GPU, it the execution policy is 
+  // parallel unsequenced, as it is the only execution policy prohibiting throwing
+  // exceptions and allowing SIMD instructions
+  if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
+                       __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+    std::__par_backend::__parallel_for_simd_1(__first, __last - __first, __func);
+  }
+  // Else if the excution policy is parallel, we execute for_each on the CPU instead
+  else  if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
+                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+    std::__terminate_on_exception([&] {
+      std::__par_backend::__parallel_for(
+          __first, __last, [__func](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
+            std::__pstl_for_each<__remove_parallel_policy_t<_ExecutionPolicy>>(
+                __cpu_backend_tag{}, __brick_first, __brick_last, __func);
+          });
+    });
+  // Else we execute for_each in serial
+  } else {
+    std::for_each(__first, __last, __func);
+  }
+}
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FOR_EACH_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
new file mode 100644
index 000000000000000..840118dbec5057c
--- /dev/null
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
@@ -0,0 +1,91 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_OMP_OFFLOAD_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_OMP_OFFLOAD_H
+
+#include <__assert>
+#include <__config>
+#include <__utility/move.h>
+#include <cstddef>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+_LIBCPP_PUSH_MACROS
+#include <__undef_macros>
+
+#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+namespace __par_backend {
+inline namespace __omp_gpu_backend {
+
+// In OpenMP, we need to extract the pointer for the underlying data for data
+// structures like std::vector and std::array to be able to map the data to the
+// device.
+
+template <typename T>
+_LIBCPP_HIDE_FROM_ABI inline T __omp_extract_base_ptr(T p) {
+  return p;
+}
+
+template <typename T>
+_LIBCPP_HIDE_FROM_ABI inline T __omp_extract_base_ptr(std::__wrap_iter<T> w) {
+  std::pointer_traits<std::__wrap_iter<T>> PT;
+  return PT.to_address(w);
+}
+
+// Applying function or lambda in a loop
+
+template <class _Iterator, class _DifferenceType, class _Function>
+_LIBCPP_HIDE_FROM_ABI _Iterator __omp_parallel_for_simd_1(_Iterator __first, _DifferenceType __n, _Function __f) noexcept {
+  #pragma omp target teams distribute parallel for simd map(tofrom:__first[0:__n])
+  for (_DifferenceType __i = 0; __i < __n; ++__i)
+    __f(__first[__i]);
+
+  return __first + __n;
+}
+
+// Extracting the underlying pointer
+
+template <class _Iterator, class _DifferenceType, class _Function>
+_LIBCPP_HIDE_FROM_ABI _Iterator __parallel_for_simd_1(_Iterator __first, _DifferenceType __n, _Function __f) noexcept {
+  __omp_parallel_for_simd_1(__omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __f);
+  return __first + __n;
+}
+
+// Assigning a value in a loop
+
+template <class _Index, class _DifferenceType, class _Tp>
+_LIBCPP_HIDE_FROM_ABI _Index __omp_parallel_for_simd_val_1(_Index __first, _DifferenceType __n, const _Tp& __value) noexcept {
+  #pragma omp target teams distribute parallel for simd map(tofrom:__first[0:__n]) map(to:__value)
+  for (_DifferenceType __i = 0; __i < __n; ++__i)
+    __first[__i] = __value;
+
+  return __first + __n;
+}
+
+template <class _Index, class _DifferenceType, class _Tp>
+_LIBCPP_HIDE_FROM_ABI _Index __parallel_for_simd_val_1(_Index __first, _DifferenceType __n, const _Tp& __value) noexcept {
+  __omp_parallel_for_simd_val_1(__omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __value);
+  return __first + __n;
+}
+
+} // namespace __omp_gpu_backend
+} // namespace __par_backend
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && && _LIBCPP_STD_VER >= 17
+
+_LIBCPP_POP_MACROS
+
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_OMP_OFFLOAD_H
diff --git a/libcxx/include/__config_site.in b/libcxx/include/__config_site.in
index c85cbcd02c441b9..e0edddce3afc3ff 100644
--- a/libcxx/include/__config_site.in
+++ b/libcxx/include/__config_site.in
@@ -34,6 +34,7 @@
 #cmakedefine _LIBCPP_PSTL_CPU_BACKEND_SERIAL
 #cmakedefine _LIBCPP_PSTL_CPU_BACKEND_THREAD
 #cmakedefine _LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH
+#cmakedefine _LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD
 
 // Hardening.
 #cmakedefine01 _LIBCPP_ENABLE_HARDENED_MODE_DEFAULT

>From 437af1cc2bd74e780eaf627c8e623a13349b6a83 Mon Sep 17 00:00:00 2001
From: antonrydahl <rydahl2610 at gmail.com>
Date: Wed, 20 Sep 2023 17:48:25 -0700
Subject: [PATCH 02/46] Clang formatting OpenMP backend for parallel algorithms

---
 libcxx/include/__algorithm/pstl_backend.h           |  6 +++---
 .../include/__algorithm/pstl_backends/gpu_backend.h |  4 ++--
 .../__algorithm/pstl_backends/gpu_backends/fill.h   | 12 ++++++------
 .../pstl_backends/gpu_backends/for_each.h           | 12 ++++++------
 .../pstl_backends/gpu_backends/omp_offload.h        | 13 ++++++++-----
 5 files changed, 25 insertions(+), 22 deletions(-)

diff --git a/libcxx/include/__algorithm/pstl_backend.h b/libcxx/include/__algorithm/pstl_backend.h
index aeba1cdf8dd9f05..06e0790a283d551 100644
--- a/libcxx/include/__algorithm/pstl_backend.h
+++ b/libcxx/include/__algorithm/pstl_backend.h
@@ -211,17 +211,17 @@ struct __select_backend<std::execution::parallel_policy> {
   using type = __cpu_backend_tag;
 };
 
-#   if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
+#    if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
 template <>
 struct __select_backend<std::execution::parallel_unsequenced_policy> {
   using type = __gpu_backend_tag;
 };
-#   else
+#    else
 template <>
 struct __select_backend<std::execution::parallel_unsequenced_policy> {
   using type = __cpu_backend_tag;
 };
-#   endif
+#    endif
 
 #  else
 
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
index 46a85f77b5deb99..7237036156a1bf3 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
@@ -14,8 +14,8 @@
 #include <__algorithm/pstl_backends/gpu_backends/backend.h>
 
 #if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
-#include <__algorithm/pstl_backends/gpu_backends/fill.h>
-#include <__algorithm/pstl_backends/gpu_backends/for_each.h>
+#  include <__algorithm/pstl_backends/gpu_backends/fill.h>
+#  include <__algorithm/pstl_backends/gpu_backends/for_each.h>
 #endif
 
 #endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
index 5603e18a5d2d3fc..32926da87e2a083 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
@@ -10,8 +10,8 @@
 #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FILL_H
 
 #include <__algorithm/fill.h>
-#include <__algorithm/pstl_backends/gpu_backends/backend.h>
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
+#include <__algorithm/pstl_backends/gpu_backends/backend.h>
 #include <__config>
 #include <__iterator/concepts.h>
 #include <__type_traits/is_execution_policy.h>
@@ -29,16 +29,16 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 template <class _ExecutionPolicy, class _ForwardIterator, class _Tp>
 _LIBCPP_HIDE_FROM_ABI void
 __pstl_fill(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) {
-  // It is only safe to execute for_each on the GPU, it the execution policy is 
+  // It is only safe to execute for_each on the GPU, it the execution policy is
   // parallel unsequenced, as it is the only execution policy prohibiting throwing
   // exceptions and allowing SIMD instructions
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
-                       __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
     std::__par_backend::__parallel_for_simd_val_1(__first, __last - __first, __value);
   }
   // Else if the excution policy is parallel, we execute for_each on the CPU instead
-  else  if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
-                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+  else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
+                     __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
     std::__terminate_on_exception([&] {
       __par_backend::__parallel_for(
           __first, __last, [&__value](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
@@ -46,7 +46,7 @@ __pstl_fill(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last
                 __cpu_backend_tag{}, __brick_first, __brick_last, __value);
           });
     });
-  // Else we execute for_each in serial
+    // Else we execute for_each in serial
   } else {
     std::fill(__first, __last, __value);
   }
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
index 20486d83863f420..14de2af8e4a15c6 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
@@ -10,8 +10,8 @@
 #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FOR_EACH_H
 
 #include <__algorithm/for_each.h>
-#include <__algorithm/pstl_backends/gpu_backends/backend.h>
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
+#include <__algorithm/pstl_backends/gpu_backends/backend.h>
 #include <__config>
 #include <__iterator/concepts.h>
 #include <__type_traits/is_execution_policy.h>
@@ -29,16 +29,16 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 template <class _ExecutionPolicy, class _ForwardIterator, class _Functor>
 _LIBCPP_HIDE_FROM_ABI void
 __pstl_for_each(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Functor __func) {
-  // It is only safe to execute for_each on the GPU, it the execution policy is 
+  // It is only safe to execute for_each on the GPU, it the execution policy is
   // parallel unsequenced, as it is the only execution policy prohibiting throwing
   // exceptions and allowing SIMD instructions
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
-                       __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
     std::__par_backend::__parallel_for_simd_1(__first, __last - __first, __func);
   }
   // Else if the excution policy is parallel, we execute for_each on the CPU instead
-  else  if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
-                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+  else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
+                     __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
     std::__terminate_on_exception([&] {
       std::__par_backend::__parallel_for(
           __first, __last, [__func](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
@@ -46,7 +46,7 @@ __pstl_for_each(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __
                 __cpu_backend_tag{}, __brick_first, __brick_last, __func);
           });
     });
-  // Else we execute for_each in serial
+    // Else we execute for_each in serial
   } else {
     std::for_each(__first, __last, __func);
   }
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
index 840118dbec5057c..4baa4e7f65859d1 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
@@ -46,8 +46,9 @@ _LIBCPP_HIDE_FROM_ABI inline T __omp_extract_base_ptr(std::__wrap_iter<T> w) {
 // Applying function or lambda in a loop
 
 template <class _Iterator, class _DifferenceType, class _Function>
-_LIBCPP_HIDE_FROM_ABI _Iterator __omp_parallel_for_simd_1(_Iterator __first, _DifferenceType __n, _Function __f) noexcept {
-  #pragma omp target teams distribute parallel for simd map(tofrom:__first[0:__n])
+_LIBCPP_HIDE_FROM_ABI _Iterator
+__omp_parallel_for_simd_1(_Iterator __first, _DifferenceType __n, _Function __f) noexcept {
+#  pragma omp target teams distribute parallel for simd map(tofrom : __first[0 : __n])
   for (_DifferenceType __i = 0; __i < __n; ++__i)
     __f(__first[__i]);
 
@@ -65,8 +66,9 @@ _LIBCPP_HIDE_FROM_ABI _Iterator __parallel_for_simd_1(_Iterator __first, _Differ
 // Assigning a value in a loop
 
 template <class _Index, class _DifferenceType, class _Tp>
-_LIBCPP_HIDE_FROM_ABI _Index __omp_parallel_for_simd_val_1(_Index __first, _DifferenceType __n, const _Tp& __value) noexcept {
-  #pragma omp target teams distribute parallel for simd map(tofrom:__first[0:__n]) map(to:__value)
+_LIBCPP_HIDE_FROM_ABI _Index
+__omp_parallel_for_simd_val_1(_Index __first, _DifferenceType __n, const _Tp& __value) noexcept {
+#  pragma omp target teams distribute parallel for simd map(tofrom : __first[0 : __n]) map(to : __value)
   for (_DifferenceType __i = 0; __i < __n; ++__i)
     __first[__i] = __value;
 
@@ -74,7 +76,8 @@ _LIBCPP_HIDE_FROM_ABI _Index __omp_parallel_for_simd_val_1(_Index __first, _Diff
 }
 
 template <class _Index, class _DifferenceType, class _Tp>
-_LIBCPP_HIDE_FROM_ABI _Index __parallel_for_simd_val_1(_Index __first, _DifferenceType __n, const _Tp& __value) noexcept {
+_LIBCPP_HIDE_FROM_ABI _Index
+__parallel_for_simd_val_1(_Index __first, _DifferenceType __n, const _Tp& __value) noexcept {
   __omp_parallel_for_simd_val_1(__omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __value);
   return __first + __n;
 }

>From 61b774396cc2677bac747718c1775273fa768f56 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Thu, 21 Sep 2023 12:50:17 -0700
Subject: [PATCH 03/46] Making PSTL GPU backend depend on CMake options rather
 than command line options

---
 libcxx/CMakeLists.txt                                 | 11 +++++------
 libcxx/include/__algorithm/pstl_backend.h             |  2 +-
 .../include/__algorithm/pstl_backends/gpu_backend.h   |  2 +-
 .../__algorithm/pstl_backends/gpu_backends/backend.h  |  8 ++++++--
 libcxx/include/__config_site.in                       |  1 +
 5 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt
index 7abc2f50c5d9272..b29dc867ca1f2c7 100644
--- a/libcxx/CMakeLists.txt
+++ b/libcxx/CMakeLists.txt
@@ -303,12 +303,10 @@ else()
   set(LIBCXX_PSTL_CPU_BACKEND "serial" CACHE STRING "Which PSTL CPU backend to use")
 endif()
 
-if (NOT DEFINED LIBCXX_PSTL_GPU_BACKEND)
-  if (${LIBCXX_ENABLE_GPU_OFFLOAD})
-    set(LIBCXX_PSTL_GPU_BACKEND "omp_offload" CACHE STRING "Which PSTL GPU backend to use")
-  else()
-    set(LIBCXX_PSTL_GPU_BACKEND "none" CACHE STRING "Which PSTL GPU backend to use")
-  endif()
+if (${LIBCXX_ENABLE_GPU_OFFLOAD})
+  set(LIBCXX_PSTL_GPU_BACKEND "omp_offload" CACHE STRING "Which PSTL GPU backend to use")
+else()
+  set(LIBCXX_PSTL_GPU_BACKEND "none" CACHE STRING "Which PSTL GPU backend to use")
 endif()
 
 # Misc options ----------------------------------------------------------------
@@ -793,6 +791,7 @@ else()
                        Valid backends are: serial, std_thread and libdispatch")
 endif()
 
+config_define_if(LIBCXX_ENABLE_GPU_OFFLOAD _LIBCPP_PSTL_GPU_OFFLOAD)
 if (LIBCXX_PSTL_GPU_BACKEND STREQUAL "omp_offload")
   config_define(1 _LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
 endif()
diff --git a/libcxx/include/__algorithm/pstl_backend.h b/libcxx/include/__algorithm/pstl_backend.h
index 06e0790a283d551..e7e7244d5e9666d 100644
--- a/libcxx/include/__algorithm/pstl_backend.h
+++ b/libcxx/include/__algorithm/pstl_backend.h
@@ -211,7 +211,7 @@ struct __select_backend<std::execution::parallel_policy> {
   using type = __cpu_backend_tag;
 };
 
-#    if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
+#    if defined(_LIBCPP_PSTL_GPU_OFFLOAD)
 template <>
 struct __select_backend<std::execution::parallel_unsequenced_policy> {
   using type = __gpu_backend_tag;
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
index 7237036156a1bf3..d2a814b441224a5 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
@@ -13,7 +13,7 @@
 
 #include <__algorithm/pstl_backends/gpu_backends/backend.h>
 
-#if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
+#if defined(_LIBCPP_PSTL_GPU_OFFLOAD)
 #  include <__algorithm/pstl_backends/gpu_backends/fill.h>
 #  include <__algorithm/pstl_backends/gpu_backends/for_each.h>
 #endif
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/backend.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/backend.h
index a8b400afbb94d9d..a03ad35d8d2ae3e 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/backend.h
@@ -12,8 +12,12 @@
 #include <__config>
 #include <cstddef>
 
-#if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
-#  include <__algorithm/pstl_backends/gpu_backends/omp_offload.h>
+#if defined(_LIBCPP_PSTL_GPU_OFFLOAD)
+#  if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
+#    include <__algorithm/pstl_backends/gpu_backends/omp_offload.h>
+#  else
+#    error Invalid PSTL GPU backend
+#  endif
 #endif
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
diff --git a/libcxx/include/__config_site.in b/libcxx/include/__config_site.in
index e0edddce3afc3ff..e7fb4f423079333 100644
--- a/libcxx/include/__config_site.in
+++ b/libcxx/include/__config_site.in
@@ -35,6 +35,7 @@
 #cmakedefine _LIBCPP_PSTL_CPU_BACKEND_THREAD
 #cmakedefine _LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH
 #cmakedefine _LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD
+#cmakedefine _LIBCPP_PSTL_GPU_OFFLOAD
 
 // Hardening.
 #cmakedefine01 _LIBCPP_ENABLE_HARDENED_MODE_DEFAULT

>From 958b25b9b2c20bb21a5c078508291d32d2501e13 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Thu, 21 Sep 2023 17:07:58 -0700
Subject: [PATCH 04/46] Added OpenMP offloaded version of std::transform

---
 libcxx/include/CMakeLists.txt                 |   1 +
 .../__algorithm/pstl_backends/gpu_backend.h   |   1 +
 .../pstl_backends/gpu_backends/omp_offload.h  | 119 +++++++++++++++++-
 .../pstl_backends/gpu_backends/transform.h    | 117 +++++++++++++++++
 4 files changed, 233 insertions(+), 5 deletions(-)
 create mode 100644 libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h

diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index 7e524cfb4f72ae5..f34928da1b82a39 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -90,6 +90,7 @@ set(files
   __algorithm/pstl_backends/gpu_backends/fill.h
   __algorithm/pstl_backends/gpu_backends/for_each.h
   __algorithm/pstl_backends/gpu_backends/omp_offload.h
+  __algorithm/pstl_backends/gpu_backends/transform.h
   __algorithm/pstl_copy.h
   __algorithm/pstl_count.h
   __algorithm/pstl_fill.h
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
index d2a814b441224a5..dac26592dac5c1f 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
@@ -16,6 +16,7 @@
 #if defined(_LIBCPP_PSTL_GPU_OFFLOAD)
 #  include <__algorithm/pstl_backends/gpu_backends/fill.h>
 #  include <__algorithm/pstl_backends/gpu_backends/for_each.h>
+#  include <__algorithm/pstl_backends/gpu_backends/transform.h>
 #endif
 
 #endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
index 4baa4e7f65859d1..69221cbb8519233 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
@@ -28,6 +28,17 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 namespace __par_backend {
 inline namespace __omp_gpu_backend {
 
+// Checking if a pointer is in a range
+template <typename T1, typename T2, typename T3>
+_LIBCPP_HIDE_FROM_ABI inline bool __omp_in_ptr_range(T1 a, T2 p, T3 b) {
+  return false;
+}
+
+template <typename T>
+_LIBCPP_HIDE_FROM_ABI inline bool __omp_in_ptr_range(T* a, T* p, T* b) {
+  return std::less_equal<T*>{}(a, p) && std::less<T*>{}(p, b);
+}
+
 // In OpenMP, we need to extract the pointer for the underlying data for data
 // structures like std::vector and std::array to be able to map the data to the
 // device.
@@ -43,12 +54,16 @@ _LIBCPP_HIDE_FROM_ABI inline T __omp_extract_base_ptr(std::__wrap_iter<T> w) {
   return PT.to_address(w);
 }
 
+//===----------------------------------------------------------------------===//
+// Templates for one iterator
+//===----------------------------------------------------------------------===//
+
 // Applying function or lambda in a loop
 
 template <class _Iterator, class _DifferenceType, class _Function>
 _LIBCPP_HIDE_FROM_ABI _Iterator
-__omp_parallel_for_simd_1(_Iterator __first, _DifferenceType __n, _Function __f) noexcept {
-#  pragma omp target teams distribute parallel for simd map(tofrom : __first[0 : __n])
+__omp_parallel_for_simd_1(_Iterator __first, _DifferenceType __n, _Function __f, const int __device = 0) noexcept {
+#  pragma omp target teams distribute parallel for simd map(tofrom : __first[0 : __n]) device(__device)
   for (_DifferenceType __i = 0; __i < __n; ++__i)
     __f(__first[__i]);
 
@@ -66,9 +81,10 @@ _LIBCPP_HIDE_FROM_ABI _Iterator __parallel_for_simd_1(_Iterator __first, _Differ
 // Assigning a value in a loop
 
 template <class _Index, class _DifferenceType, class _Tp>
-_LIBCPP_HIDE_FROM_ABI _Index
-__omp_parallel_for_simd_val_1(_Index __first, _DifferenceType __n, const _Tp& __value) noexcept {
-#  pragma omp target teams distribute parallel for simd map(tofrom : __first[0 : __n]) map(to : __value)
+_LIBCPP_HIDE_FROM_ABI _Index __omp_parallel_for_simd_val_1(
+    _Index __first, _DifferenceType __n, const _Tp& __value, const int __device = 0) noexcept {
+#  pragma omp target teams distribute parallel for simd map(from : __first[0 : __n]) map(always, to : __value)         \
+      device(__device)
   for (_DifferenceType __i = 0; __i < __n; ++__i)
     __first[__i] = __value;
 
@@ -82,6 +98,99 @@ __parallel_for_simd_val_1(_Index __first, _DifferenceType __n, const _Tp& __valu
   return __first + __n;
 }
 
+//===----------------------------------------------------------------------===//
+// Templates for two iterators
+//===----------------------------------------------------------------------===//
+
+template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Function>
+_LIBCPP_HIDE_FROM_ABI _Iterator1 __omp_parallel_for_simd_2(
+    _Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Function __f, const int __device = 0) noexcept {
+  if ((!std::is_same<_Iterator1, _Iterator2>::value) ||
+      (std::is_same<_Iterator1, _Iterator2>::value &&
+       !__omp_gpu_backend::__omp_in_ptr_range(__first1, __first2, __first1 + __n))) {
+#  pragma omp target teams distribute parallel for simd map(to : __first1[0 : __n]) map(from : __first2[0 : __n])      \
+      device(__device)
+    for (_DifferenceType __i = 0; __i < __n; ++__i)
+      __f(__first1[__i], __first2[__i]);
+    return __first1 + __n;
+  }
+#  pragma omp target teams distribute parallel for simd map(tofrom : __first1[0 : __n], __first2[0 : __n])             \
+      device(__device)
+  for (_DifferenceType __i = 0; __i < __n; ++__i)
+    __f(__first1[__i], __first2[__i]);
+
+  return __first1 + __n;
+}
+
+// Extracting the underlying pointer
+
+template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Function>
+_LIBCPP_HIDE_FROM_ABI _Iterator1
+__parallel_for_simd_2(_Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Function __f) noexcept {
+  __omp_parallel_for_simd_2(
+      __omp_gpu_backend::__omp_extract_base_ptr(__first1),
+      __n,
+      __omp_gpu_backend::__omp_extract_base_ptr(__first2),
+      __f);
+  return __first1 + __n;
+}
+
+//===----------------------------------------------------------------------===//
+// Templates for three iterator
+//===----------------------------------------------------------------------===//
+
+template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Iterator3, class _Function>
+_LIBCPP_HIDE_FROM_ABI _Iterator1 __omp_parallel_for_simd_3(
+    _Iterator1 __first1,
+    _DifferenceType __n,
+    _Iterator2 __first2,
+    _Iterator3 __first3,
+    _Function __f,
+    const int __device = 0) noexcept {
+  // It may be that __first3 is in the interval [__first1+__n) or [__firt2+__n)
+  // It is, however, undefined behavior to compare two pointers that do not
+  // point to the same object or are not the same type.
+  // If we can prove that __first3 is not in any of the ranges [__first1+__n)
+  // or [__firt2+__n), it is safe to reduce the amount of data copied to and
+  // from the device
+  constexpr bool are_not_same_type =
+      !std::is_same<_Iterator1, _Iterator2>::value && !std::is_same<_Iterator1, _Iterator3>::value;
+  const bool no_overlap_13 =
+      std::is_same<_Iterator1, _Iterator3>::value &&
+      !__omp_gpu_backend::__omp_in_ptr_range(__first1, __first3, __first1 + __n);
+  const bool no_overlap_23 =
+      std::is_same<_Iterator2, _Iterator3>::value &&
+      !__omp_gpu_backend::__omp_in_ptr_range(__first2, __first3, __first2 + __n);
+  if (are_not_same_type || (no_overlap_13 && no_overlap_23)) {
+#  pragma omp target teams distribute parallel for simd map(to : __first1[0 : __n], __first2[0 : __n])                 \
+      map(from : __first3[0 : __n]) device(__device)
+    for (_DifferenceType __i = 0; __i < __n; ++__i)
+      __f(__first1[__i], __first2[__i], __first3[__i]);
+    return __first1 + __n;
+  }
+  // In the general case, we have to map all data to and from the device
+#  pragma omp target teams distribute parallel for simd map(                                                           \
+          tofrom : __first1[0 : __n], __first2[0 : __n], __first3[0 : __n]) device(__device)
+  for (_DifferenceType __i = 0; __i < __n; ++__i)
+    __f(__first1[__i], __first2[__i], __first3[__i]);
+
+  return __first1 + __n;
+}
+
+// Extracting the underlying pointer
+
+template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Iterator3, class _Function>
+_LIBCPP_HIDE_FROM_ABI _Iterator1 __parallel_for_simd_3(
+    _Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Iterator3 __first3, _Function __f) noexcept {
+  __omp_parallel_for_simd_3(
+      __omp_gpu_backend::__omp_extract_base_ptr(__first1),
+      __n,
+      __omp_gpu_backend::__omp_extract_base_ptr(__first2),
+      __omp_gpu_backend::__omp_extract_base_ptr(__first3),
+      __f);
+  return __first1 + __n;
+}
+
 } // namespace __omp_gpu_backend
 } // namespace __par_backend
 
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
new file mode 100644
index 000000000000000..03eba11a3f5f52b
--- /dev/null
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
@@ -0,0 +1,117 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_TRANSFORM_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_TRANSFORM_H
+
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
+#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__algorithm/transform.h>
+#include <__config>
+#include <__iterator/concepts.h>
+#include <__iterator/iterator_traits.h>
+#include <__type_traits/enable_if.h>
+#include <__type_traits/is_execution_policy.h>
+#include <__type_traits/remove_cvref.h>
+#include <__utility/terminate_on_exception.h>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+template <class _ExecutionPolicy, class _ForwardIterator, class _ForwardOutIterator, class _UnaryOperation>
+_LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
+    __gpu_backend_tag,
+    _ForwardIterator __first,
+    _ForwardIterator __last,
+    _ForwardOutIterator __result,
+    _UnaryOperation __op) {
+  if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
+                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
+                __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) {
+    return std::__par_backend::__parallel_for_simd_2(
+        __first,
+        __last - __first,
+        __result,
+        [&](__iter_reference<_ForwardIterator> __in_value, __iter_reference<_ForwardOutIterator> __out_value) {
+          __out_value = __op(__in_value);
+        });
+  } else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
+                       __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
+                       __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) {
+    std::__terminate_on_exception([&] {
+      std::__par_backend::__parallel_for(
+          __first, __last, [__op, __first, __result](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
+            return std::__pstl_transform<__remove_parallel_policy_t<_ExecutionPolicy>>(
+                __cpu_backend_tag{}, __brick_first, __brick_last, __result + (__brick_first - __first), __op);
+          });
+    });
+    return __result + (__last - __first);
+  } else {
+    return std::transform(__first, __last, __result, __op);
+  }
+}
+
+template <class _ExecutionPolicy,
+          class _ForwardIterator1,
+          class _ForwardIterator2,
+          class _ForwardOutIterator,
+          class _BinaryOperation,
+          enable_if_t<is_execution_policy_v<__remove_cvref_t<_ExecutionPolicy>>, int> = 0>
+_LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
+    __gpu_backend_tag,
+    _ForwardIterator1 __first1,
+    _ForwardIterator1 __last1,
+    _ForwardIterator2 __first2,
+    _ForwardOutIterator __result,
+    _BinaryOperation __op) {
+  if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
+                __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
+                __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value &&
+                __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) {
+    return std::__par_backend::__parallel_for_simd_3(
+        __first1,
+        __last1 - __first1,
+        __first2,
+        __result,
+        [&](__iter_reference<_ForwardIterator1> __in1,
+            __iter_reference<_ForwardIterator2> __in2,
+            __iter_reference<_ForwardOutIterator> __out_value) { __out_value = __op(__in1, __in2); });
+  } else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
+                       __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
+                       __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value &&
+                       __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) {
+    std::__terminate_on_exception([&] {
+      std::__par_backend::__parallel_for(
+          __first1,
+          __last1,
+          [__op, __first1, __first2, __result](_ForwardIterator1 __brick_first, _ForwardIterator1 __brick_last) {
+            return std::__pstl_transform<__remove_parallel_policy_t<_ExecutionPolicy>>(
+                __cpu_backend_tag{},
+                __brick_first,
+                __brick_last,
+                __first2 + (__brick_first - __first1),
+                __result + (__brick_first - __first1),
+                __op);
+          });
+    });
+    return __result + (__last1 - __first1);
+  } else {
+    return std::transform(__first1, __last1, __first2, __result, __op);
+  }
+}
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_TRANSFORM_H

>From 2b57d6c67a9c741ecfa3df3f8489045aa681c227 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Fri, 22 Sep 2023 11:55:53 -0700
Subject: [PATCH 05/46] Changing lambdas to capture by value in std::transform
 for GPUs

---
 .../__algorithm/pstl_backends/gpu_backends/transform.h    | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
index 03eba11a3f5f52b..7fcfde44aaaa7a6 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
@@ -38,11 +38,13 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
                 __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) {
+    // While the CPU backend captures by reference, [&], that is not valid when
+    // offloading to the GPU. Therefore we must capture by value, [=].
     return std::__par_backend::__parallel_for_simd_2(
         __first,
         __last - __first,
         __result,
-        [&](__iter_reference<_ForwardIterator> __in_value, __iter_reference<_ForwardOutIterator> __out_value) {
+        [=](__iter_reference<_ForwardIterator> __in_value, __iter_reference<_ForwardOutIterator> __out_value) {
           __out_value = __op(__in_value);
         });
   } else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
@@ -78,12 +80,14 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
                 __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value &&
                 __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) {
+    // While the CPU backend captures by reference, [&], that is not valid when
+    // offloading to the GPU. Therefore we must capture by value, [=].
     return std::__par_backend::__parallel_for_simd_3(
         __first1,
         __last1 - __first1,
         __first2,
         __result,
-        [&](__iter_reference<_ForwardIterator1> __in1,
+        [=](__iter_reference<_ForwardIterator1> __in1,
             __iter_reference<_ForwardIterator2> __in2,
             __iter_reference<_ForwardOutIterator> __out_value) { __out_value = __op(__in1, __in2); });
   } else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&

>From 49896931e48d00f0b9df2d3f07014d4bcb9192f3 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Mon, 25 Sep 2023 13:13:39 -0700
Subject: [PATCH 06/46] GPU Offloading Implementation of std::transform_reduce

---
 libcxx/include/CMakeLists.txt                 |   1 +
 .../__algorithm/pstl_backends/gpu_backend.h   |   1 +
 .../pstl_backends/gpu_backends/omp_offload.h  | 113 ++++++++++++++
 .../gpu_backends/transform_reduce.h           | 147 ++++++++++++++++++
 4 files changed, 262 insertions(+)
 create mode 100644 libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h

diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index f34928da1b82a39..115514cb31bf13a 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -91,6 +91,7 @@ set(files
   __algorithm/pstl_backends/gpu_backends/for_each.h
   __algorithm/pstl_backends/gpu_backends/omp_offload.h
   __algorithm/pstl_backends/gpu_backends/transform.h
+  __algorithm/pstl_backends/gpu_backends/transform_reduce.h
   __algorithm/pstl_copy.h
   __algorithm/pstl_count.h
   __algorithm/pstl_fill.h
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
index dac26592dac5c1f..ea7f39dea905474 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
@@ -17,6 +17,7 @@
 #  include <__algorithm/pstl_backends/gpu_backends/fill.h>
 #  include <__algorithm/pstl_backends/gpu_backends/for_each.h>
 #  include <__algorithm/pstl_backends/gpu_backends/transform.h>
+#  include <__algorithm/pstl_backends/gpu_backends/transform_reduce.h>
 #endif
 
 #endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
index 69221cbb8519233..d1cc6133f8e0876 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
@@ -191,6 +191,119 @@ _LIBCPP_HIDE_FROM_ABI _Iterator1 __parallel_for_simd_3(
   return __first1 + __n;
 }
 
+//===----------------------------------------------------------------------===//
+// Templates for reductions
+//===----------------------------------------------------------------------===//
+
+// General case
+
+#  define __PSTL_OMP_SIMD_1_REDUCTION(omp_op, std_op)                                                                            \
+    template <class _Iterator,                                                                                                   \
+              class _DifferenceType,                                                                                             \
+              typename _Tp,                                                                                                      \
+              typename _BinaryOperationType,                                                                                     \
+              typename _UnaryOperation,                                                                                          \
+              __enable_if_t<is_arithmetic_v<_Tp>, int> = 0 >                                                                     \
+    _LIBCPP_HIDE_FROM_ABI _Tp __omp_parallel_for_simd_reduction_1(                                                               \
+        _Iterator __first,                                                                                                       \
+        _DifferenceType __n,                                                                                                     \
+        _Tp __init,                                                                                                              \
+        std_op<_BinaryOperationType> __reduce,                                                                                   \
+        _UnaryOperation __transform,                                                                                             \
+        const int __device = 0) noexcept {                                                                                       \
+_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init) map(to : __first[0 : __n]) device(__device)) \
+      for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                            \
+        __init = __reduce(__init, __transform(__first[__i]));                                                                    \
+      return __init;                                                                                                             \
+    }
+
+#  define __PSTL_OMP_SIMD_2_REDUCTION(omp_op, std_op)                                                                                                \
+    template <class _Iterator1,                                                                                                                      \
+              class _Iterator2,                                                                                                                      \
+              class _DifferenceType,                                                                                                                 \
+              typename _Tp,                                                                                                                          \
+              typename _BinaryOperationType,                                                                                                         \
+              typename _UnaryOperation,                                                                                                              \
+              __enable_if_t<is_arithmetic_v<_Tp>, int> = 0 >                                                                                         \
+    _LIBCPP_HIDE_FROM_ABI _Tp __omp_parallel_for_simd_reduction_2(                                                                                   \
+        _Iterator1 __first1,                                                                                                                         \
+        _Iterator2 __first2,                                                                                                                         \
+        _DifferenceType __n,                                                                                                                         \
+        _Tp __init,                                                                                                                                  \
+        std_op<_BinaryOperationType> __reduce,                                                                                                       \
+        _UnaryOperation __transform,                                                                                                                 \
+        const int __device = 0) noexcept {                                                                                                           \
+_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init) map(to : __first1[0 : __n], __first2[0 : __n]) device(__device)) \
+      for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                                                \
+        __init = __reduce(__init, __transform(__first1[__i], __first2[__i]));                                                                        \
+      return __init;                                                                                                                                 \
+    }
+
+#  define __PSTL_OMP_SIMD_REDUCTION(omp_op, std_op)                                                                    \
+    __PSTL_OMP_SIMD_1_REDUCTION(omp_op, std_op)                                                                        \
+    __PSTL_OMP_SIMD_2_REDUCTION(omp_op, std_op)
+
+// Addition
+__PSTL_OMP_SIMD_REDUCTION(+, std::plus)
+
+// Subtraction
+__PSTL_OMP_SIMD_REDUCTION(-, std::minus)
+
+// Multiplication
+__PSTL_OMP_SIMD_REDUCTION(*, std::multiplies)
+
+// Logical and
+__PSTL_OMP_SIMD_REDUCTION(&&, std::logical_and)
+
+// Logical or
+__PSTL_OMP_SIMD_REDUCTION(||, std::logical_or)
+
+// Bitwise and
+__PSTL_OMP_SIMD_REDUCTION(&, std::bit_and)
+
+// Bitwise or
+__PSTL_OMP_SIMD_REDUCTION(|, std::bit_or)
+
+// Bitwise xor
+__PSTL_OMP_SIMD_REDUCTION(^, std::bit_xor)
+
+// Extracting the underlying pointers
+
+template <class _Iterator, class _DifferenceType, typename _Tp, typename _BinaryOperation, typename _UnaryOperation >
+_LIBCPP_HIDE_FROM_ABI _Tp __parallel_for_simd_reduction_1(
+    _Iterator __first,
+    _DifferenceType __n,
+    _Tp __init,
+    _BinaryOperation __reduce,
+    _UnaryOperation __transform,
+    const int __device = 0) noexcept {
+  return __omp_parallel_for_simd_reduction_1(
+      __omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __init, __reduce, __transform);
+}
+
+template <class _Iterator1,
+          class _Iterator2,
+          class _DifferenceType,
+          typename _Tp,
+          typename _BinaryOperation,
+          typename _UnaryOperation >
+_LIBCPP_HIDE_FROM_ABI _Tp __parallel_for_simd_reduction_2(
+    _Iterator1 __first1,
+    _Iterator2 __first2,
+    _DifferenceType __n,
+    _Tp __init,
+    _BinaryOperation __reduce,
+    _UnaryOperation __transform,
+    const int __device = 0) noexcept {
+  return __omp_parallel_for_simd_reduction_2(
+      __omp_gpu_backend::__omp_extract_base_ptr(__first1),
+      __omp_gpu_backend::__omp_extract_base_ptr(__first2),
+      __n,
+      __init,
+      __reduce,
+      __transform);
+}
+
 } // namespace __omp_gpu_backend
 } // namespace __par_backend
 
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h
new file mode 100644
index 000000000000000..43e5631aef04afb
--- /dev/null
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h
@@ -0,0 +1,147 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_TRANSFORM_REDUCE_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_TRANSFORM_REDUCE_H
+
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
+#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__config>
+#include <__iterator/concepts.h>
+#include <__iterator/iterator_traits.h>
+#include <__numeric/transform_reduce.h>
+#include <__type_traits/is_arithmetic.h>
+#include <__type_traits/is_execution_policy.h>
+#include <__type_traits/operation_traits.h>
+#include <__utility/move.h>
+#include <__utility/terminate_on_exception.h>
+#include <new>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+//===----------------------------------------------------------------------===//
+// Two input iterators
+//===----------------------------------------------------------------------===//
+
+template <class _ExecutionPolicy,
+          class _ForwardIterator1,
+          class _ForwardIterator2,
+          class _Tp,
+          class _BinaryOperation1,
+          class _BinaryOperation2>
+_LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
+    __gpu_backend_tag,
+    _ForwardIterator1 __first1,
+    _ForwardIterator1 __last1,
+    _ForwardIterator2 __first2,
+    _Tp __init,
+    _BinaryOperation1 __reduce,
+    _BinaryOperation2 __transform) {
+  if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
+                __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
+                __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value) {
+    return std::__par_backend::__parallel_for_simd_reduction_2(
+        std::move(__first1),
+        std::move(__first2),
+        __last1 - __first1,
+        std::move(__init),
+        std::move(__reduce),
+        [=](__iter_reference<_ForwardIterator1> __in_value_1, __iter_reference<_ForwardIterator1> __in_value_2) {
+          return __transform(__in_value_1, __in_value_2);
+        });
+  } else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
+                       __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
+                       __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value) {
+    return std::__terminate_on_exception([&] {
+      return __par_backend::__parallel_transform_reduce(
+          __first1,
+          std::move(__last1),
+          [__first1, __first2, __transform](_ForwardIterator1 __iter) {
+            return __transform(*__iter, *(__first2 + (__iter - __first1)));
+          },
+          std::move(__init),
+          std::move(__reduce),
+          [__first1, __first2, __reduce, __transform](
+              _ForwardIterator1 __brick_first, _ForwardIterator1 __brick_last, _Tp __brick_init) {
+            return std::__pstl_transform_reduce<__remove_parallel_policy_t<_ExecutionPolicy>>(
+                __cpu_backend_tag{},
+                __brick_first,
+                std::move(__brick_last),
+                __first2 + (__brick_first - __first1),
+                std::move(__brick_init),
+                std::move(__reduce),
+                std::move(__transform));
+          });
+    });
+  } else {
+    return std::transform_reduce(
+        std::move(__first1),
+        std::move(__last1),
+        std::move(__first2),
+        std::move(__init),
+        std::move(__reduce),
+        std::move(__transform));
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// One input iterator
+//===----------------------------------------------------------------------===//
+
+template <class _ExecutionPolicy, class _ForwardIterator, class _Tp, class _BinaryOperation, class _UnaryOperation>
+_LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
+    __gpu_backend_tag,
+    _ForwardIterator __first,
+    _ForwardIterator __last,
+    _Tp __init,
+    _BinaryOperation __reduce,
+    _UnaryOperation __transform) {
+  if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
+                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+    return std::__par_backend::__parallel_for_simd_reduction_1(
+        std::move(__first),
+        __last - __first,
+        std::move(__init),
+        std::move(__reduce),
+        [=](__iter_reference<_ForwardIterator> __in_value) { return __transform(__in_value); });
+  } else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
+                       __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+    return std::__terminate_on_exception([&] {
+      return __par_backend::__parallel_transform_reduce(
+          std::move(__first),
+          std::move(__last),
+          [__transform](_ForwardIterator __iter) { return __transform(*__iter); },
+          std::move(__init),
+          __reduce,
+          [__transform, __reduce](auto __brick_first, auto __brick_last, _Tp __brick_init) {
+            return std::__pstl_transform_reduce<__remove_parallel_policy_t<_ExecutionPolicy>>(
+                __cpu_backend_tag{},
+                std::move(__brick_first),
+                std::move(__brick_last),
+                std::move(__brick_init),
+                std::move(__reduce),
+                std::move(__transform));
+          });
+    });
+  } else {
+    return std::transform_reduce(
+        std::move(__first), std::move(__last), std::move(__init), std::move(__reduce), std::move(__transform));
+  }
+}
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_TRANSFORM_REDUCE_H

>From 1870dd54eaa1fd8a0d78c99dfd9c9e6c58465fbd Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Tue, 26 Sep 2023 19:55:12 -0700
Subject: [PATCH 07/46] Fixed almost all test cases that failed during ninja
 check-cxx

---
 libcxx/include/CMakeLists.txt                 |   4 +
 .../__algorithm/pstl_backends/gpu_backend.h   |   4 +
 .../pstl_backends/gpu_backends/any_of.h       |  41 +++++++
 .../pstl_backends/gpu_backends/fill.h         |  20 +---
 .../pstl_backends/gpu_backends/find_if.h      |  44 ++++++++
 .../pstl_backends/gpu_backends/for_each.h     |  17 +--
 .../pstl_backends/gpu_backends/merge.h        |  51 +++++++++
 .../pstl_backends/gpu_backends/omp_offload.h  |  94 +++++++++-------
 .../pstl_backends/gpu_backends/stable_sort.h  |  38 +++++++
 .../pstl_backends/gpu_backends/transform.h    |  61 ++--------
 .../gpu_backends/transform_reduce.h           | 105 ++++++------------
 11 files changed, 292 insertions(+), 187 deletions(-)
 create mode 100644 libcxx/include/__algorithm/pstl_backends/gpu_backends/any_of.h
 create mode 100644 libcxx/include/__algorithm/pstl_backends/gpu_backends/find_if.h
 create mode 100644 libcxx/include/__algorithm/pstl_backends/gpu_backends/merge.h
 create mode 100644 libcxx/include/__algorithm/pstl_backends/gpu_backends/stable_sort.h

diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index 115514cb31bf13a..cecde9c28f4f49d 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -86,10 +86,14 @@ set(files
   __algorithm/pstl_backends/cpu_backends/transform.h
   __algorithm/pstl_backends/cpu_backends/transform_reduce.h
   __algorithm/pstl_backends/gpu_backend.h
+  __algorithm/pstl_backends/gpu_backends/any_of.h
   __algorithm/pstl_backends/gpu_backends/backend.h
   __algorithm/pstl_backends/gpu_backends/fill.h
+  __algorithm/pstl_backends/gpu_backends/find_if.h
   __algorithm/pstl_backends/gpu_backends/for_each.h
+  __algorithm/pstl_backends/gpu_backends/merge.h
   __algorithm/pstl_backends/gpu_backends/omp_offload.h
+  __algorithm/pstl_backends/gpu_backends/stable_sort.h
   __algorithm/pstl_backends/gpu_backends/transform.h
   __algorithm/pstl_backends/gpu_backends/transform_reduce.h
   __algorithm/pstl_copy.h
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
index ea7f39dea905474..f41332fbf9f6d42 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
@@ -14,8 +14,12 @@
 #include <__algorithm/pstl_backends/gpu_backends/backend.h>
 
 #if defined(_LIBCPP_PSTL_GPU_OFFLOAD)
+#  include <__algorithm/pstl_backends/gpu_backends/any_of.h>
 #  include <__algorithm/pstl_backends/gpu_backends/fill.h>
+#  include <__algorithm/pstl_backends/gpu_backends/find_if.h>
 #  include <__algorithm/pstl_backends/gpu_backends/for_each.h>
+#  include <__algorithm/pstl_backends/gpu_backends/merge.h>
+#  include <__algorithm/pstl_backends/gpu_backends/stable_sort.h>
 #  include <__algorithm/pstl_backends/gpu_backends/transform.h>
 #  include <__algorithm/pstl_backends/gpu_backends/transform_reduce.h>
 #endif
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/any_of.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/any_of.h
new file mode 100644
index 000000000000000..8d911de55dcd685
--- /dev/null
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/any_of.h
@@ -0,0 +1,41 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_ANY_OF_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_ANY_OF_H
+
+#include <__algorithm/any_of.h>
+#include <__algorithm/find_if.h>
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
+#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__atomic/atomic.h>
+#include <__atomic/memory_order.h>
+#include <__config>
+#include <__functional/operations.h>
+#include <__iterator/concepts.h>
+#include <__type_traits/is_execution_policy.h>
+#include <__utility/pair.h>
+#include <__utility/terminate_on_exception.h>
+#include <cstdint>
+
+#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+template <class _ExecutionPolicy, class _ForwardIterator, class _Predicate>
+_LIBCPP_HIDE_FROM_ABI bool
+__pstl_any_of(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) {
+  // TODO: Implement GPU backend
+  return std::__pstl_any_of<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __pred);
+}
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_ANY_OF_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
index 32926da87e2a083..8dc6bc6a6179c0e 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
@@ -14,6 +14,7 @@
 #include <__algorithm/pstl_backends/gpu_backends/backend.h>
 #include <__config>
 #include <__iterator/concepts.h>
+#include <__iterator/iterator_traits.h>
 #include <__type_traits/is_execution_policy.h>
 #include <__utility/terminate_on_exception.h>
 #include <stdio.h>
@@ -33,23 +34,12 @@ __pstl_fill(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last
   // parallel unsequenced, as it is the only execution policy prohibiting throwing
   // exceptions and allowing SIMD instructions
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
-                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
+                __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
     std::__par_backend::__parallel_for_simd_val_1(__first, __last - __first, __value);
   }
-  // Else if the excution policy is parallel, we execute for_each on the CPU instead
-  else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
-                     __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
-    std::__terminate_on_exception([&] {
-      __par_backend::__parallel_for(
-          __first, __last, [&__value](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
-            std::__pstl_fill<__remove_parallel_policy_t<_ExecutionPolicy>>(
-                __cpu_backend_tag{}, __brick_first, __brick_last, __value);
-          });
-    });
-    // Else we execute for_each in serial
-  } else {
-    std::fill(__first, __last, __value);
-  }
+  // Otherwise, we execute for_each on the CPU instead
+  return std::__pstl_fill<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __value);
 }
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/find_if.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/find_if.h
new file mode 100644
index 000000000000000..2d34938f92dff38
--- /dev/null
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/find_if.h
@@ -0,0 +1,44 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_FIND_IF_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_FIND_IF_H
+
+#include <__algorithm/find_if.h>
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
+#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__atomic/atomic.h>
+#include <__config>
+#include <__functional/operations.h>
+#include <__iterator/concepts.h>
+#include <__iterator/iterator_traits.h>
+#include <__type_traits/is_execution_policy.h>
+#include <__utility/pair.h>
+#include <__utility/terminate_on_exception.h>
+#include <cstddef>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+template <class _ExecutionPolicy, class _ForwardIterator, class _Predicate>
+_LIBCPP_HIDE_FROM_ABI _ForwardIterator
+__pstl_find_if(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) {
+  // TODO: Implement the GPU backend
+  return std::__pstl_find_if<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __pred);
+}
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_FIND_IF_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
index 14de2af8e4a15c6..23c8da27e64ae3b 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
@@ -33,23 +33,12 @@ __pstl_for_each(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __
   // parallel unsequenced, as it is the only execution policy prohibiting throwing
   // exceptions and allowing SIMD instructions
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
-                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
+                __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
     std::__par_backend::__parallel_for_simd_1(__first, __last - __first, __func);
   }
   // Else if the excution policy is parallel, we execute for_each on the CPU instead
-  else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
-                     __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
-    std::__terminate_on_exception([&] {
-      std::__par_backend::__parallel_for(
-          __first, __last, [__func](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
-            std::__pstl_for_each<__remove_parallel_policy_t<_ExecutionPolicy>>(
-                __cpu_backend_tag{}, __brick_first, __brick_last, __func);
-          });
-    });
-    // Else we execute for_each in serial
-  } else {
-    std::for_each(__first, __last, __func);
-  }
+  return std::__pstl_for_each<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __func);
 }
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/merge.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/merge.h
new file mode 100644
index 000000000000000..bc947ebb27ac7f0
--- /dev/null
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/merge.h
@@ -0,0 +1,51 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_MERGE_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_MERGE_H
+
+#include <__algorithm/merge.h>
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
+#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__config>
+#include <__iterator/concepts.h>
+#include <__type_traits/is_execution_policy.h>
+#include <__utility/move.h>
+#include <__utility/terminate_on_exception.h>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+template <class _ExecutionPolicy,
+          class _ForwardIterator1,
+          class _ForwardIterator2,
+          class _ForwardOutIterator,
+          class _Comp>
+_LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_merge(
+    __gpu_backend_tag,
+    _ForwardIterator1 __first1,
+    _ForwardIterator1 __last1,
+    _ForwardIterator2 __first2,
+    _ForwardIterator2 __last2,
+    _ForwardOutIterator __result,
+    _Comp __comp) {
+  // TODO: Implement GPU backend
+  return std::__pstl_merge<_ExecutionPolicy>(
+      __cpu_backend_tag{}, __first1, __last1, __first2, __last2, __result, __comp);
+}
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_MERGE_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
index d1cc6133f8e0876..36acafd448ec003 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
@@ -11,9 +11,19 @@
 
 #include <__assert>
 #include <__config>
+#include <__functional/operations.h>
+#include <__iterator/wrap_iter.h>
+#include <__memory/addressof.h>
+#include <__memory/pointer_traits.h>
+#include <__type_traits/is_pointer.h>
+#include <__type_traits/is_same.h>
 #include <__utility/move.h>
 #include <cstddef>
 
+// is_same
+
+// __libcpp_is_contiguous_iterator
+
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
 #endif
@@ -30,27 +40,33 @@ inline namespace __omp_gpu_backend {
 
 // Checking if a pointer is in a range
 template <typename T1, typename T2, typename T3>
-_LIBCPP_HIDE_FROM_ABI inline bool __omp_in_ptr_range(T1 a, T2 p, T3 b) {
+_LIBCPP_HIDE_FROM_ABI inline bool __omp_in_ptr_range(T1, T2, T3) {
   return false;
 }
 
-template <typename T>
-_LIBCPP_HIDE_FROM_ABI inline bool __omp_in_ptr_range(T* a, T* p, T* b) {
-  return std::less_equal<T*>{}(a, p) && std::less<T*>{}(p, b);
+template <typename _Tp>
+_LIBCPP_HIDE_FROM_ABI inline bool __omp_in_ptr_range(_Tp* a, _Tp* p, _Tp* b) {
+  return std::less_equal<_Tp*>{}(a, p) && std::less<_Tp*>{}(p, b);
 }
 
 // In OpenMP, we need to extract the pointer for the underlying data for data
 // structures like std::vector and std::array to be able to map the data to the
 // device.
 
-template <typename T>
-_LIBCPP_HIDE_FROM_ABI inline T __omp_extract_base_ptr(T p) {
+template <typename _Tp, std::enable_if<std::is_pointer<_Tp>::value >::type* = 0>
+_LIBCPP_HIDE_FROM_ABI inline _Tp __omp_extract_base_ptr(_Tp p) {
   return p;
 }
 
-template <typename T>
-_LIBCPP_HIDE_FROM_ABI inline T __omp_extract_base_ptr(std::__wrap_iter<T> w) {
-  std::pointer_traits<std::__wrap_iter<T>> PT;
+template <typename _Tp>
+_LIBCPP_HIDE_FROM_ABI inline auto __omp_extract_base_ptr(_Tp p) {
+  return std::addressof(*p);
+  ;
+}
+
+template <typename _Tp>
+_LIBCPP_HIDE_FROM_ABI inline _Tp __omp_extract_base_ptr(std::__wrap_iter<_Tp> w) {
+  std::pointer_traits<std::__wrap_iter<_Tp>> PT;
   return PT.to_address(w);
 }
 
@@ -61,8 +77,8 @@ _LIBCPP_HIDE_FROM_ABI inline T __omp_extract_base_ptr(std::__wrap_iter<T> w) {
 // Applying function or lambda in a loop
 
 template <class _Iterator, class _DifferenceType, class _Function>
-_LIBCPP_HIDE_FROM_ABI _Iterator
-__omp_parallel_for_simd_1(_Iterator __first, _DifferenceType __n, _Function __f, const int __device = 0) noexcept {
+_LIBCPP_HIDE_FROM_ABI _Iterator __omp_parallel_for_simd_1(
+    _Iterator __first, _DifferenceType __n, _Function __f, [[maybe_unused]] const int __device = 0) noexcept {
 #  pragma omp target teams distribute parallel for simd map(tofrom : __first[0 : __n]) device(__device)
   for (_DifferenceType __i = 0; __i < __n; ++__i)
     __f(__first[__i]);
@@ -82,7 +98,7 @@ _LIBCPP_HIDE_FROM_ABI _Iterator __parallel_for_simd_1(_Iterator __first, _Differ
 
 template <class _Index, class _DifferenceType, class _Tp>
 _LIBCPP_HIDE_FROM_ABI _Index __omp_parallel_for_simd_val_1(
-    _Index __first, _DifferenceType __n, const _Tp& __value, const int __device = 0) noexcept {
+    _Index __first, _DifferenceType __n, const _Tp& __value, [[maybe_unused]] const int __device = 0) noexcept {
 #  pragma omp target teams distribute parallel for simd map(from : __first[0 : __n]) map(always, to : __value)         \
       device(__device)
   for (_DifferenceType __i = 0; __i < __n; ++__i)
@@ -104,20 +120,24 @@ __parallel_for_simd_val_1(_Index __first, _DifferenceType __n, const _Tp& __valu
 
 template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Function>
 _LIBCPP_HIDE_FROM_ABI _Iterator1 __omp_parallel_for_simd_2(
-    _Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Function __f, const int __device = 0) noexcept {
+    _Iterator1 __first1,
+    _DifferenceType __n,
+    _Iterator2 __first2,
+    _Function __f,
+    [[maybe_unused]] const int __device = 0) noexcept {
   if ((!std::is_same<_Iterator1, _Iterator2>::value) ||
       (std::is_same<_Iterator1, _Iterator2>::value &&
        !__omp_gpu_backend::__omp_in_ptr_range(__first1, __first2, __first1 + __n))) {
 #  pragma omp target teams distribute parallel for simd map(to : __first1[0 : __n]) map(from : __first2[0 : __n])      \
       device(__device)
     for (_DifferenceType __i = 0; __i < __n; ++__i)
-      __f(__first1[__i], __first2[__i]);
+      __first2[__i] = __f(__first1[__i]);
     return __first1 + __n;
   }
 #  pragma omp target teams distribute parallel for simd map(tofrom : __first1[0 : __n], __first2[0 : __n])             \
       device(__device)
   for (_DifferenceType __i = 0; __i < __n; ++__i)
-    __f(__first1[__i], __first2[__i]);
+    __first2[__i] = __f(__first1[__i]);
 
   return __first1 + __n;
 }
@@ -146,7 +166,7 @@ _LIBCPP_HIDE_FROM_ABI _Iterator1 __omp_parallel_for_simd_3(
     _Iterator2 __first2,
     _Iterator3 __first3,
     _Function __f,
-    const int __device = 0) noexcept {
+    [[maybe_unused]] const int __device = 0) noexcept {
   // It may be that __first3 is in the interval [__first1+__n) or [__firt2+__n)
   // It is, however, undefined behavior to compare two pointers that do not
   // point to the same object or are not the same type.
@@ -165,14 +185,14 @@ _LIBCPP_HIDE_FROM_ABI _Iterator1 __omp_parallel_for_simd_3(
 #  pragma omp target teams distribute parallel for simd map(to : __first1[0 : __n], __first2[0 : __n])                 \
       map(from : __first3[0 : __n]) device(__device)
     for (_DifferenceType __i = 0; __i < __n; ++__i)
-      __f(__first1[__i], __first2[__i], __first3[__i]);
+      __first3[__i] = __f(__first1[__i], __first2[__i]);
     return __first1 + __n;
   }
   // In the general case, we have to map all data to and from the device
 #  pragma omp target teams distribute parallel for simd map(                                                           \
           tofrom : __first1[0 : __n], __first2[0 : __n], __first3[0 : __n]) device(__device)
   for (_DifferenceType __i = 0; __i < __n; ++__i)
-    __f(__first1[__i], __first2[__i], __first3[__i]);
+    __first3[__i] = __f(__first1[__i], __first2[__i]);
 
   return __first1 + __n;
 }
@@ -197,46 +217,44 @@ _LIBCPP_HIDE_FROM_ABI _Iterator1 __parallel_for_simd_3(
 
 // General case
 
-#  define __PSTL_OMP_SIMD_1_REDUCTION(omp_op, std_op)                                                                            \
+#  define __PSTL_OMP_SIMD_1_REDUCTION(omp_op, std_op)                                                                                 \
     template <class _Iterator,                                                                                                   \
               class _DifferenceType,                                                                                             \
               typename _Tp,                                                                                                      \
               typename _BinaryOperationType,                                                                                     \
-              typename _UnaryOperation,                                                                                          \
-              __enable_if_t<is_arithmetic_v<_Tp>, int> = 0 >                                                                     \
+              typename _UnaryOperation>                                                                     \
     _LIBCPP_HIDE_FROM_ABI _Tp __omp_parallel_for_simd_reduction_1(                                                               \
         _Iterator __first,                                                                                                       \
         _DifferenceType __n,                                                                                                     \
         _Tp __init,                                                                                                              \
         std_op<_BinaryOperationType> __reduce,                                                                                   \
-        _UnaryOperation __transform,                                                                                             \
-        const int __device = 0) noexcept {                                                                                       \
-_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init) map(to : __first[0 : __n]) device(__device)) \
-      for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                            \
-        __init = __reduce(__init, __transform(__first[__i]));                                                                    \
-      return __init;                                                                                                             \
+        _UnaryOperation __transform/*,                                                                                             \
+        [[maybe_unused]] const int __device = 0*/) noexcept {    \
+_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init) map(to : __first[0 : __n])) /*device(__device))*/ \
+      for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                                 \
+        __init = __reduce(__init, __transform(__first[__i]));                                                                         \
+      return __init;                                                                                                                  \
     }
 
-#  define __PSTL_OMP_SIMD_2_REDUCTION(omp_op, std_op)                                                                                                \
+#  define __PSTL_OMP_SIMD_2_REDUCTION(omp_op, std_op)                                                                                                     \
     template <class _Iterator1,                                                                                                                      \
               class _Iterator2,                                                                                                                      \
               class _DifferenceType,                                                                                                                 \
               typename _Tp,                                                                                                                          \
               typename _BinaryOperationType,                                                                                                         \
-              typename _UnaryOperation,                                                                                                              \
-              __enable_if_t<is_arithmetic_v<_Tp>, int> = 0 >                                                                                         \
+              typename _UnaryOperation >                                                                                         \
     _LIBCPP_HIDE_FROM_ABI _Tp __omp_parallel_for_simd_reduction_2(                                                                                   \
         _Iterator1 __first1,                                                                                                                         \
         _Iterator2 __first2,                                                                                                                         \
         _DifferenceType __n,                                                                                                                         \
         _Tp __init,                                                                                                                                  \
         std_op<_BinaryOperationType> __reduce,                                                                                                       \
-        _UnaryOperation __transform,                                                                                                                 \
-        const int __device = 0) noexcept {                                                                                                           \
-_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init) map(to : __first1[0 : __n], __first2[0 : __n]) device(__device)) \
-      for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                                                \
-        __init = __reduce(__init, __transform(__first1[__i], __first2[__i]));                                                                        \
-      return __init;                                                                                                                                 \
+        _UnaryOperation __transform/*,                                                                                                                 \
+        [[maybe_unused]] const int __device = 0*/) noexcept {    \
+_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init) map(to : __first1[0 : __n], __first2[0 : __n]))/* device(__device))*/ \
+      for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                                                     \
+        __init = __reduce(__init, __transform(__first1[__i], __first2[__i]));                                                                             \
+      return __init;                                                                                                                                      \
     }
 
 #  define __PSTL_OMP_SIMD_REDUCTION(omp_op, std_op)                                                                    \
@@ -276,7 +294,7 @@ _LIBCPP_HIDE_FROM_ABI _Tp __parallel_for_simd_reduction_1(
     _Tp __init,
     _BinaryOperation __reduce,
     _UnaryOperation __transform,
-    const int __device = 0) noexcept {
+    [[maybe_unused]] const int __device = 0) noexcept {
   return __omp_parallel_for_simd_reduction_1(
       __omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __init, __reduce, __transform);
 }
@@ -294,7 +312,7 @@ _LIBCPP_HIDE_FROM_ABI _Tp __parallel_for_simd_reduction_2(
     _Tp __init,
     _BinaryOperation __reduce,
     _UnaryOperation __transform,
-    const int __device = 0) noexcept {
+    [[maybe_unused]] const int __device = 0) noexcept {
   return __omp_parallel_for_simd_reduction_2(
       __omp_gpu_backend::__omp_extract_base_ptr(__first1),
       __omp_gpu_backend::__omp_extract_base_ptr(__first2),
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/stable_sort.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/stable_sort.h
new file mode 100644
index 000000000000000..1760a9fd9fc9d32
--- /dev/null
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/stable_sort.h
@@ -0,0 +1,38 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_STABLE_SORT_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_STABLE_SORT_H
+
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
+#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__algorithm/stable_sort.h>
+#include <__config>
+#include <__type_traits/is_execution_policy.h>
+#include <__utility/terminate_on_exception.h>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+template <class _ExecutionPolicy, class _RandomAccessIterator, class _Comp>
+_LIBCPP_HIDE_FROM_ABI void
+__pstl_stable_sort(__gpu_backend_tag, _RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp) {
+  // TODO: Implement GPU backend.
+  return __pstl_stable_sort<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __comp);
+}
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_STABLE_SORT_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
index 7fcfde44aaaa7a6..10f6e5ff174d675 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
@@ -37,30 +37,13 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
     _UnaryOperation __op) {
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
-                __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) {
+                __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value &&
+                __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
     // While the CPU backend captures by reference, [&], that is not valid when
     // offloading to the GPU. Therefore we must capture by value, [=].
-    return std::__par_backend::__parallel_for_simd_2(
-        __first,
-        __last - __first,
-        __result,
-        [=](__iter_reference<_ForwardIterator> __in_value, __iter_reference<_ForwardOutIterator> __out_value) {
-          __out_value = __op(__in_value);
-        });
-  } else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
-                       __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
-                       __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) {
-    std::__terminate_on_exception([&] {
-      std::__par_backend::__parallel_for(
-          __first, __last, [__op, __first, __result](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
-            return std::__pstl_transform<__remove_parallel_policy_t<_ExecutionPolicy>>(
-                __cpu_backend_tag{}, __brick_first, __brick_last, __result + (__brick_first - __first), __op);
-          });
-    });
-    return __result + (__last - __first);
-  } else {
-    return std::transform(__first, __last, __result, __op);
+    return std::__par_backend::__parallel_for_simd_2(__first, __last - __first, __result, __op);
   }
+  return std::__pstl_transform<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __result, __op);
 }
 
 template <class _ExecutionPolicy,
@@ -79,39 +62,15 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value &&
-                __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) {
+                __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value &&
+                __libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
+                __libcpp_is_contiguous_iterator<_ForwardIterator2>::value &&
+                __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
     // While the CPU backend captures by reference, [&], that is not valid when
     // offloading to the GPU. Therefore we must capture by value, [=].
-    return std::__par_backend::__parallel_for_simd_3(
-        __first1,
-        __last1 - __first1,
-        __first2,
-        __result,
-        [=](__iter_reference<_ForwardIterator1> __in1,
-            __iter_reference<_ForwardIterator2> __in2,
-            __iter_reference<_ForwardOutIterator> __out_value) { __out_value = __op(__in1, __in2); });
-  } else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
-                       __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
-                       __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value &&
-                       __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) {
-    std::__terminate_on_exception([&] {
-      std::__par_backend::__parallel_for(
-          __first1,
-          __last1,
-          [__op, __first1, __first2, __result](_ForwardIterator1 __brick_first, _ForwardIterator1 __brick_last) {
-            return std::__pstl_transform<__remove_parallel_policy_t<_ExecutionPolicy>>(
-                __cpu_backend_tag{},
-                __brick_first,
-                __brick_last,
-                __first2 + (__brick_first - __first1),
-                __result + (__brick_first - __first1),
-                __op);
-          });
-    });
-    return __result + (__last1 - __first1);
-  } else {
-    return std::transform(__first1, __last1, __first2, __result, __op);
+    return std::__par_backend::__parallel_for_simd_3(__first1, __last1 - __first1, __first2, __result, __op);
   }
+  return std::__pstl_transform<_ExecutionPolicy>(__cpu_backend_tag{}, __first1, __last1, __first2, __result, __op);
 }
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h
index 43e5631aef04afb..8590dd3d024ea69 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h
@@ -12,9 +12,11 @@
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/gpu_backends/backend.h>
 #include <__config>
+#include <__functional/operations.h>
 #include <__iterator/concepts.h>
 #include <__iterator/iterator_traits.h>
 #include <__numeric/transform_reduce.h>
+#include <__type_traits/integral_constant.h>
 #include <__type_traits/is_arithmetic.h>
 #include <__type_traits/is_execution_policy.h>
 #include <__type_traits/operation_traits.h>
@@ -28,6 +30,25 @@
 
 #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
 
+template <class _T1, class _T2, class _T3>
+struct __is_supported_reduction : std::false_type {};
+
+#  define __PSTL_IS_SUPPORTED_REDUCTION(funname)                                                                       \
+    template <class _Tp>                                                                                               \
+    struct __is_supported_reduction<std::funname<_Tp>, _Tp, _Tp> : std::true_type {};                                  \
+    template <class _Tp, class _Up>                                                                                    \
+    struct __is_supported_reduction<std::funname<>, _Tp, _Up> : std::true_type {};
+
+// __is_trivial_plus_operation already exists
+__PSTL_IS_SUPPORTED_REDUCTION(plus)
+__PSTL_IS_SUPPORTED_REDUCTION(minus)
+__PSTL_IS_SUPPORTED_REDUCTION(multiplies)
+__PSTL_IS_SUPPORTED_REDUCTION(logical_and)
+__PSTL_IS_SUPPORTED_REDUCTION(logical_or)
+__PSTL_IS_SUPPORTED_REDUCTION(bit_and)
+__PSTL_IS_SUPPORTED_REDUCTION(bit_or)
+__PSTL_IS_SUPPORTED_REDUCTION(bit_xor)
+
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 //===----------------------------------------------------------------------===//
@@ -50,49 +71,16 @@ _LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
     _BinaryOperation2 __transform) {
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
-                __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value) {
+                __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value &&
+                __libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
+                __libcpp_is_contiguous_iterator<_ForwardIterator2>::value && is_arithmetic_v<_Tp> &&
+                (__is_trivial_plus_operation<_BinaryOperation1, _Tp, _Tp>::value ||
+                 __is_supported_reduction<_BinaryOperation1, _Tp, _Tp>::value)) {
     return std::__par_backend::__parallel_for_simd_reduction_2(
-        std::move(__first1),
-        std::move(__first2),
-        __last1 - __first1,
-        std::move(__init),
-        std::move(__reduce),
-        [=](__iter_reference<_ForwardIterator1> __in_value_1, __iter_reference<_ForwardIterator1> __in_value_2) {
-          return __transform(__in_value_1, __in_value_2);
-        });
-  } else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
-                       __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
-                       __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value) {
-    return std::__terminate_on_exception([&] {
-      return __par_backend::__parallel_transform_reduce(
-          __first1,
-          std::move(__last1),
-          [__first1, __first2, __transform](_ForwardIterator1 __iter) {
-            return __transform(*__iter, *(__first2 + (__iter - __first1)));
-          },
-          std::move(__init),
-          std::move(__reduce),
-          [__first1, __first2, __reduce, __transform](
-              _ForwardIterator1 __brick_first, _ForwardIterator1 __brick_last, _Tp __brick_init) {
-            return std::__pstl_transform_reduce<__remove_parallel_policy_t<_ExecutionPolicy>>(
-                __cpu_backend_tag{},
-                __brick_first,
-                std::move(__brick_last),
-                __first2 + (__brick_first - __first1),
-                std::move(__brick_init),
-                std::move(__reduce),
-                std::move(__transform));
-          });
-    });
-  } else {
-    return std::transform_reduce(
-        std::move(__first1),
-        std::move(__last1),
-        std::move(__first2),
-        std::move(__init),
-        std::move(__reduce),
-        std::move(__transform));
+        __first1, __first2, __last1 - __first1, __init, __reduce, __transform);
   }
+  return std::__pstl_transform_reduce<_ExecutionPolicy>(
+      __cpu_backend_tag{}, __first1, __last1, __first2, std::move(__init), __reduce, __transform);
 }
 
 //===----------------------------------------------------------------------===//
@@ -108,36 +96,15 @@ _LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
     _BinaryOperation __reduce,
     _UnaryOperation __transform) {
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
-                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
+                __libcpp_is_contiguous_iterator<_ForwardIterator>::value && is_arithmetic_v<_Tp> &&
+                (__is_trivial_plus_operation<_BinaryOperation, _Tp, _Tp>::value ||
+                 __is_supported_reduction<_BinaryOperation, _Tp, _Tp>::value)) {
     return std::__par_backend::__parallel_for_simd_reduction_1(
-        std::move(__first),
-        __last - __first,
-        std::move(__init),
-        std::move(__reduce),
-        [=](__iter_reference<_ForwardIterator> __in_value) { return __transform(__in_value); });
-  } else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
-                       __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
-    return std::__terminate_on_exception([&] {
-      return __par_backend::__parallel_transform_reduce(
-          std::move(__first),
-          std::move(__last),
-          [__transform](_ForwardIterator __iter) { return __transform(*__iter); },
-          std::move(__init),
-          __reduce,
-          [__transform, __reduce](auto __brick_first, auto __brick_last, _Tp __brick_init) {
-            return std::__pstl_transform_reduce<__remove_parallel_policy_t<_ExecutionPolicy>>(
-                __cpu_backend_tag{},
-                std::move(__brick_first),
-                std::move(__brick_last),
-                std::move(__brick_init),
-                std::move(__reduce),
-                std::move(__transform));
-          });
-    });
-  } else {
-    return std::transform_reduce(
-        std::move(__first), std::move(__last), std::move(__init), std::move(__reduce), std::move(__transform));
+        __first, __last - __first, __init, __reduce, __transform);
   }
+  return std::__pstl_transform_reduce<_ExecutionPolicy>(
+      __cpu_backend_tag{}, __first, __last, std::move(__init), __reduce, __transform);
 }
 
 _LIBCPP_END_NAMESPACE_STD

>From 06d97c142b3d0c1e5ea8a580b3f5496919156d83 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Tue, 26 Sep 2023 20:15:33 -0700
Subject: [PATCH 08/46] Missing return statements in fill and for_each

---
 libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h    | 2 +-
 .../include/__algorithm/pstl_backends/gpu_backends/for_each.h   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
index 8dc6bc6a6179c0e..d109495009df895 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
@@ -36,7 +36,7 @@ __pstl_fill(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
-    std::__par_backend::__parallel_for_simd_val_1(__first, __last - __first, __value);
+    return std::__par_backend::__parallel_for_simd_val_1(__first, __last - __first, __value);
   }
   // Otherwise, we execute for_each on the CPU instead
   return std::__pstl_fill<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __value);
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
index 23c8da27e64ae3b..bab0c87de8f2fc7 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
@@ -35,7 +35,7 @@ __pstl_for_each(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
-    std::__par_backend::__parallel_for_simd_1(__first, __last - __first, __func);
+    return std::__par_backend::__parallel_for_simd_1(__first, __last - __first, __func);
   }
   // Else if the excution policy is parallel, we execute for_each on the CPU instead
   return std::__pstl_for_each<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __func);

>From 7f74f362cb37b65b98ca8c8c9aa559d8d1db03a4 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Wed, 27 Sep 2023 10:31:16 -0700
Subject: [PATCH 09/46] Passing all LIT tests

---
 .../__algorithm/pstl_backends/gpu_backends/fill.h | 10 ++++++----
 .../pstl_backends/gpu_backends/for_each.h         |  8 +++++---
 .../pstl_backends/gpu_backends/stable_sort.h      |  2 +-
 .../pstl_backends/gpu_backends/transform.h        | 15 ++++++++-------
 .../pstl_backends/gpu_backends/transform_reduce.h |  6 +++---
 5 files changed, 23 insertions(+), 18 deletions(-)

diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
index d109495009df895..f32ee8b016b3eae 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
@@ -30,16 +30,18 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 template <class _ExecutionPolicy, class _ForwardIterator, class _Tp>
 _LIBCPP_HIDE_FROM_ABI void
 __pstl_fill(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) {
-  // It is only safe to execute for_each on the GPU, it the execution policy is
+  // It is only safe to execute fill on the GPU, it the execution policy is
   // parallel unsequenced, as it is the only execution policy prohibiting throwing
   // exceptions and allowing SIMD instructions
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
-    return std::__par_backend::__parallel_for_simd_val_1(__first, __last - __first, __value);
+    std::__par_backend::__parallel_for_simd_val_1(__first, __last - __first, __value);
+  }
+  // Otherwise, we execute fill on the CPU instead
+  else {
+    std::__pstl_fill<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __value);
   }
-  // Otherwise, we execute for_each on the CPU instead
-  return std::__pstl_fill<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __value);
 }
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
index bab0c87de8f2fc7..f96b30b5ba25b24 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
@@ -35,10 +35,12 @@ __pstl_for_each(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
-    return std::__par_backend::__parallel_for_simd_1(__first, __last - __first, __func);
+    std::__par_backend::__parallel_for_simd_1(__first, __last - __first, __func);
+  }
+  // Else we fall back to the GPU backend
+  else {
+    std::__pstl_for_each<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __func);
   }
-  // Else if the excution policy is parallel, we execute for_each on the CPU instead
-  return std::__pstl_for_each<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __func);
 }
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/stable_sort.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/stable_sort.h
index 1760a9fd9fc9d32..5cd7081ef73e9c1 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/stable_sort.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/stable_sort.h
@@ -28,7 +28,7 @@ template <class _ExecutionPolicy, class _RandomAccessIterator, class _Comp>
 _LIBCPP_HIDE_FROM_ABI void
 __pstl_stable_sort(__gpu_backend_tag, _RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp) {
   // TODO: Implement GPU backend.
-  return __pstl_stable_sort<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __comp);
+  __pstl_stable_sort<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __comp);
 }
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
index 10f6e5ff174d675..c2e43cb6d643375 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
@@ -38,11 +38,12 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
                 __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value &&
-                __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
-    // While the CPU backend captures by reference, [&], that is not valid when
-    // offloading to the GPU. Therefore we must capture by value, [=].
-    return std::__par_backend::__parallel_for_simd_2(__first, __last - __first, __result, __op);
+                __libcpp_is_contiguous_iterator<_ForwardIterator>::value &&
+                __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
+    std::__par_backend::__parallel_for_simd_2(__first, __last - __first, __result, __op);
+    return __result + (__last - __first);
   }
+  // If it is not safe to offload to the GPU, we rely on the CPU backend.
   return std::__pstl_transform<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __result, __op);
 }
 
@@ -66,10 +67,10 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
                 __libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator2>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
-    // While the CPU backend captures by reference, [&], that is not valid when
-    // offloading to the GPU. Therefore we must capture by value, [=].
-    return std::__par_backend::__parallel_for_simd_3(__first1, __last1 - __first1, __first2, __result, __op);
+    std::__par_backend::__parallel_for_simd_3(__first1, __last1 - __first1, __first2, __result, __op);
+    return __result + (__last1 - __first1);
   }
+  // If it is not safe to offload to the GPU, we rely on the CPU backend.
   return std::__pstl_transform<_ExecutionPolicy>(__cpu_backend_tag{}, __first1, __last1, __first2, __result, __op);
 }
 
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h
index 8590dd3d024ea69..332bb8abc1b8e0b 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h
@@ -31,13 +31,13 @@
 #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
 
 template <class _T1, class _T2, class _T3>
-struct __is_supported_reduction : std::false_type {};
+_LIBCPP_HIDE_FROM_ABI struct __is_supported_reduction : std::false_type {};
 
 #  define __PSTL_IS_SUPPORTED_REDUCTION(funname)                                                                       \
     template <class _Tp>                                                                                               \
-    struct __is_supported_reduction<std::funname<_Tp>, _Tp, _Tp> : std::true_type {};                                  \
+    _LIBCPP_HIDE_FROM_ABI struct __is_supported_reduction<std::funname<_Tp>, _Tp, _Tp> : std::true_type {};            \
     template <class _Tp, class _Up>                                                                                    \
-    struct __is_supported_reduction<std::funname<>, _Tp, _Up> : std::true_type {};
+    _LIBCPP_HIDE_FROM_ABI struct __is_supported_reduction<std::funname<>, _Tp, _Up> : std::true_type {};
 
 // __is_trivial_plus_operation already exists
 __PSTL_IS_SUPPORTED_REDUCTION(plus)

>From f6c633e14d910f0b2ddb3fcfed20d3057464b253 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Tue, 3 Oct 2023 20:58:11 -0700
Subject: [PATCH 10/46] Handling value iterators in PSTL GPU backend for C++ 20

---
 .../pstl_backends/gpu_backends/omp_offload.h  | 219 ++++++++++--------
 .../pstl_backends/gpu_backends/transform.h    |   6 +
 .../gpu_backends/transform_reduce.h           |  24 +-
 3 files changed, 138 insertions(+), 111 deletions(-)

diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
index 36acafd448ec003..81ec1bbc63d0083 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
@@ -12,6 +12,7 @@
 #include <__assert>
 #include <__config>
 #include <__functional/operations.h>
+#include <__iterator/iterator_traits.h>
 #include <__iterator/wrap_iter.h>
 #include <__memory/addressof.h>
 #include <__memory/pointer_traits.h>
@@ -38,36 +39,66 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 namespace __par_backend {
 inline namespace __omp_gpu_backend {
 
-// Checking if a pointer is in a range
-template <typename T1, typename T2, typename T3>
-_LIBCPP_HIDE_FROM_ABI inline bool __omp_in_ptr_range(T1, T2, T3) {
-  return false;
+// Functions for eaxtracting the pase pointers
+
+// In the general case we do not need to extract it. This is for instance the
+// case for pointers.
+template <typename _Tp>
+_LIBCPP_HIDE_FROM_ABI inline auto __omp_extract_base_ptr(_Tp p) noexcept {
+  return p;
 }
 
+// For vectors and arrays, etc, we need to extract the underlying base pointer.
 template <typename _Tp>
-_LIBCPP_HIDE_FROM_ABI inline bool __omp_in_ptr_range(_Tp* a, _Tp* p, _Tp* b) {
-  return std::less_equal<_Tp*>{}(a, p) && std::less<_Tp*>{}(p, b);
+_LIBCPP_HIDE_FROM_ABI inline _Tp __omp_extract_base_ptr(std::__wrap_iter<_Tp> w) noexcept {
+  std::pointer_traits<std::__wrap_iter<_Tp>> PT;
+  return PT.to_address(w);
 }
 
-// In OpenMP, we need to extract the pointer for the underlying data for data
-// structures like std::vector and std::array to be able to map the data to the
-// device.
+//===----------------------------------------------------------------------===//
+// The following four functions differentiates between contiguous iterators and
+// non-contiguous iterators. That allows to use the same implementations for
+// reference and value iterators
+//===----------------------------------------------------------------------===//
 
-template <typename _Tp, std::enable_if<std::is_pointer<_Tp>::value >::type* = 0>
-_LIBCPP_HIDE_FROM_ABI inline _Tp __omp_extract_base_ptr(_Tp p) {
-  return p;
+template <class _Iterator, class _DifferenceType>
+_LIBCPP_HIDE_FROM_ABI void
+__omp_map_to([[maybe_unused]] const _Iterator p, [[maybe_unused]] const _DifferenceType len) noexcept {
+  if constexpr (__libcpp_is_contiguous_iterator<_Iterator>::value) {
+#  pragma omp target enter data map(to : p[0 : len])
+  } else {
+#  pragma omp target enter data map(to : p)
+  }
 }
 
-template <typename _Tp>
-_LIBCPP_HIDE_FROM_ABI inline auto __omp_extract_base_ptr(_Tp p) {
-  return std::addressof(*p);
-  ;
+template <class _Iterator, class _DifferenceType>
+_LIBCPP_HIDE_FROM_ABI void
+__omp_map_from([[maybe_unused]] const _Iterator p, [[maybe_unused]] const _DifferenceType len) noexcept {
+  if constexpr (__libcpp_is_contiguous_iterator<_Iterator>::value) {
+#  pragma omp target exit data map(from : p[0 : len])
+  } else {
+#  pragma omp target exit data map(release : p)
+  }
 }
 
-template <typename _Tp>
-_LIBCPP_HIDE_FROM_ABI inline _Tp __omp_extract_base_ptr(std::__wrap_iter<_Tp> w) {
-  std::pointer_traits<std::__wrap_iter<_Tp>> PT;
-  return PT.to_address(w);
+template <class _Iterator, class _DifferenceType>
+_LIBCPP_HIDE_FROM_ABI void
+__omp_map_alloc([[maybe_unused]] const _Iterator p, [[maybe_unused]] const _DifferenceType len) noexcept {
+  if constexpr (__libcpp_is_contiguous_iterator<_Iterator>::value) {
+#  pragma omp target enter data map(alloc : p[0 : len])
+  } else {
+#  pragma omp target enter data map(to : p)
+  }
+}
+
+template <class _Iterator, class _DifferenceType>
+_LIBCPP_HIDE_FROM_ABI void
+__omp_map_free([[maybe_unused]] const _Iterator p, [[maybe_unused]] const _DifferenceType len) noexcept {
+  if constexpr (__libcpp_is_contiguous_iterator<_Iterator>::value) {
+#  pragma omp target exit data map(release : p[0 : len])
+  } else {
+#  pragma omp target exit data map(release : p)
+  }
 }
 
 //===----------------------------------------------------------------------===//
@@ -79,9 +110,11 @@ _LIBCPP_HIDE_FROM_ABI inline _Tp __omp_extract_base_ptr(std::__wrap_iter<_Tp> w)
 template <class _Iterator, class _DifferenceType, class _Function>
 _LIBCPP_HIDE_FROM_ABI _Iterator __omp_parallel_for_simd_1(
     _Iterator __first, _DifferenceType __n, _Function __f, [[maybe_unused]] const int __device = 0) noexcept {
-#  pragma omp target teams distribute parallel for simd map(tofrom : __first[0 : __n]) device(__device)
+  __omp_map_to(__first, __n);
+#  pragma omp target teams distribute parallel for simd device(__device)
   for (_DifferenceType __i = 0; __i < __n; ++__i)
     __f(__first[__i]);
+  __omp_map_from(__first, __n);
 
   return __first + __n;
 }
@@ -99,11 +132,11 @@ _LIBCPP_HIDE_FROM_ABI _Iterator __parallel_for_simd_1(_Iterator __first, _Differ
 template <class _Index, class _DifferenceType, class _Tp>
 _LIBCPP_HIDE_FROM_ABI _Index __omp_parallel_for_simd_val_1(
     _Index __first, _DifferenceType __n, const _Tp& __value, [[maybe_unused]] const int __device = 0) noexcept {
-#  pragma omp target teams distribute parallel for simd map(from : __first[0 : __n]) map(always, to : __value)         \
-      device(__device)
+  __omp_map_alloc(__first, __n);
+#  pragma omp target teams distribute parallel for simd firstprivate(__value) device(__device)
   for (_DifferenceType __i = 0; __i < __n; ++__i)
     __first[__i] = __value;
-
+  __omp_map_from(__first, __n);
   return __first + __n;
 }
 
@@ -125,20 +158,13 @@ _LIBCPP_HIDE_FROM_ABI _Iterator1 __omp_parallel_for_simd_2(
     _Iterator2 __first2,
     _Function __f,
     [[maybe_unused]] const int __device = 0) noexcept {
-  if ((!std::is_same<_Iterator1, _Iterator2>::value) ||
-      (std::is_same<_Iterator1, _Iterator2>::value &&
-       !__omp_gpu_backend::__omp_in_ptr_range(__first1, __first2, __first1 + __n))) {
-#  pragma omp target teams distribute parallel for simd map(to : __first1[0 : __n]) map(from : __first2[0 : __n])      \
-      device(__device)
-    for (_DifferenceType __i = 0; __i < __n; ++__i)
-      __first2[__i] = __f(__first1[__i]);
-    return __first1 + __n;
-  }
-#  pragma omp target teams distribute parallel for simd map(tofrom : __first1[0 : __n], __first2[0 : __n])             \
-      device(__device)
+  __omp_map_alloc(__first2, __n);
+  __omp_map_to(__first1, __n);
+#  pragma omp target teams distribute parallel for simd device(__device)
   for (_DifferenceType __i = 0; __i < __n; ++__i)
-    __first2[__i] = __f(__first1[__i]);
-
+    *(__first2 + __i) = __f(*(__first1 + __i));
+  __omp_map_from(__first2, __n);
+  __omp_map_free(__first1, __n);
   return __first1 + __n;
 }
 
@@ -167,33 +193,15 @@ _LIBCPP_HIDE_FROM_ABI _Iterator1 __omp_parallel_for_simd_3(
     _Iterator3 __first3,
     _Function __f,
     [[maybe_unused]] const int __device = 0) noexcept {
-  // It may be that __first3 is in the interval [__first1+__n) or [__firt2+__n)
-  // It is, however, undefined behavior to compare two pointers that do not
-  // point to the same object or are not the same type.
-  // If we can prove that __first3 is not in any of the ranges [__first1+__n)
-  // or [__firt2+__n), it is safe to reduce the amount of data copied to and
-  // from the device
-  constexpr bool are_not_same_type =
-      !std::is_same<_Iterator1, _Iterator2>::value && !std::is_same<_Iterator1, _Iterator3>::value;
-  const bool no_overlap_13 =
-      std::is_same<_Iterator1, _Iterator3>::value &&
-      !__omp_gpu_backend::__omp_in_ptr_range(__first1, __first3, __first1 + __n);
-  const bool no_overlap_23 =
-      std::is_same<_Iterator2, _Iterator3>::value &&
-      !__omp_gpu_backend::__omp_in_ptr_range(__first2, __first3, __first2 + __n);
-  if (are_not_same_type || (no_overlap_13 && no_overlap_23)) {
-#  pragma omp target teams distribute parallel for simd map(to : __first1[0 : __n], __first2[0 : __n])                 \
-      map(from : __first3[0 : __n]) device(__device)
-    for (_DifferenceType __i = 0; __i < __n; ++__i)
-      __first3[__i] = __f(__first1[__i], __first2[__i]);
-    return __first1 + __n;
-  }
-  // In the general case, we have to map all data to and from the device
-#  pragma omp target teams distribute parallel for simd map(                                                           \
-          tofrom : __first1[0 : __n], __first2[0 : __n], __first3[0 : __n]) device(__device)
+  __omp_map_to(__first1, __n);
+  __omp_map_to(__first2, __n);
+  __omp_map_alloc(__first3, __n);
+#  pragma omp target teams distribute parallel for simd device(__device)
   for (_DifferenceType __i = 0; __i < __n; ++__i)
-    __first3[__i] = __f(__first1[__i], __first2[__i]);
-
+    *(__first3 + __i) = __f(*(__first1 + __i), *(__first2 + __i));
+  __omp_map_free(__first1, __n);
+  __omp_map_free(__first2, __n);
+  __omp_map_from(__first3, __n);
   return __first1 + __n;
 }
 
@@ -215,47 +223,54 @@ _LIBCPP_HIDE_FROM_ABI _Iterator1 __parallel_for_simd_3(
 // Templates for reductions
 //===----------------------------------------------------------------------===//
 
-// General case
-
-#  define __PSTL_OMP_SIMD_1_REDUCTION(omp_op, std_op)                                                                                 \
-    template <class _Iterator,                                                                                                   \
-              class _DifferenceType,                                                                                             \
-              typename _Tp,                                                                                                      \
-              typename _BinaryOperationType,                                                                                     \
-              typename _UnaryOperation>                                                                     \
-    _LIBCPP_HIDE_FROM_ABI _Tp __omp_parallel_for_simd_reduction_1(                                                               \
-        _Iterator __first,                                                                                                       \
-        _DifferenceType __n,                                                                                                     \
-        _Tp __init,                                                                                                              \
-        std_op<_BinaryOperationType> __reduce,                                                                                   \
-        _UnaryOperation __transform/*,                                                                                             \
-        [[maybe_unused]] const int __device = 0*/) noexcept {    \
-_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init) map(to : __first[0 : __n])) /*device(__device))*/ \
-      for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                                 \
-        __init = __reduce(__init, __transform(__first[__i]));                                                                         \
-      return __init;                                                                                                                  \
+// In the two following function templates, we map the pointer to the device in
+// different ways depending on if they are contiguou or not.
+
+#  define __PSTL_OMP_SIMD_1_REDUCTION(omp_op, std_op)                                                                  \
+    template <class _Iterator,                                                                                         \
+              class _DifferenceType,                                                                                   \
+              typename _Tp,                                                                                            \
+              typename _BinaryOperationType,                                                                           \
+              typename _UnaryOperation>                                                                                \
+    _LIBCPP_HIDE_FROM_ABI _Tp __omp_parallel_for_simd_reduction_1(                                                     \
+        _Iterator __first,                                                                                             \
+        _DifferenceType __n,                                                                                           \
+        _Tp __init,                                                                                                    \
+        std_op<_BinaryOperationType> __reduce,                                                                         \
+        _UnaryOperation __transform,                                                                                   \
+        [[maybe_unused]] const int __device = 0) noexcept {                                                            \
+      __omp_map_to(__first, __n);                                                                                      \
+_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init) device(__device))                  \
+      for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                  \
+        __init = __reduce(__init, __transform(*(__first + __i)));                                                      \
+      __omp_map_free(__first, __n);                                                                                    \
+      return __init;                                                                                                   \
     }
 
-#  define __PSTL_OMP_SIMD_2_REDUCTION(omp_op, std_op)                                                                                                     \
-    template <class _Iterator1,                                                                                                                      \
-              class _Iterator2,                                                                                                                      \
-              class _DifferenceType,                                                                                                                 \
-              typename _Tp,                                                                                                                          \
-              typename _BinaryOperationType,                                                                                                         \
-              typename _UnaryOperation >                                                                                         \
-    _LIBCPP_HIDE_FROM_ABI _Tp __omp_parallel_for_simd_reduction_2(                                                                                   \
-        _Iterator1 __first1,                                                                                                                         \
-        _Iterator2 __first2,                                                                                                                         \
-        _DifferenceType __n,                                                                                                                         \
-        _Tp __init,                                                                                                                                  \
-        std_op<_BinaryOperationType> __reduce,                                                                                                       \
-        _UnaryOperation __transform/*,                                                                                                                 \
-        [[maybe_unused]] const int __device = 0*/) noexcept {    \
-_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init) map(to : __first1[0 : __n], __first2[0 : __n]))/* device(__device))*/ \
-      for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                                                     \
-        __init = __reduce(__init, __transform(__first1[__i], __first2[__i]));                                                                             \
-      return __init;                                                                                                                                      \
-    }
+#  define __PSTL_OMP_SIMD_2_REDUCTION(omp_op, std_op)                                                                  \
+    template <class _Iterator1,                                                                                        \
+              class _Iterator2,                                                                                        \
+              class _DifferenceType,                                                                                   \
+              typename _Tp,                                                                                            \
+              typename _BinaryOperationType,                                                                           \
+              typename _UnaryOperation >                                                                               \
+    _LIBCPP_HIDE_FROM_ABI _Tp __omp_parallel_for_simd_reduction_2(                                                     \
+        _Iterator1 __first1,                                                                                           \
+        _Iterator2 __first2,                                                                                           \
+        _DifferenceType __n,                                                                                           \
+        _Tp __init,                                                                                                    \
+        std_op<_BinaryOperationType> __reduce,                                                                         \
+        _UnaryOperation __transform,                                                                                   \
+        [[maybe_unused]] const int __device = 0) noexcept {                                                            \
+      __omp_map_to(__first1, __n);                                                                                     \
+      __omp_map_to(__first2, __n);                                                                                     \
+_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init) device(__device))                  \
+      for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                  \
+        __init = __reduce(__init, __transform(*(__first1 + __i), *(__first2 + __i)));                                  \
+      __omp_map_free(__first1, __n);                                                                                   \
+      __omp_map_free(__first2, __n);                                                                                   \
+      return __init;                                                                                                   \
+    } // namespace __omp_gpu_backend
 
 #  define __PSTL_OMP_SIMD_REDUCTION(omp_op, std_op)                                                                    \
     __PSTL_OMP_SIMD_1_REDUCTION(omp_op, std_op)                                                                        \
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
index c2e43cb6d643375..3af2106eb2bda21 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
@@ -35,10 +35,13 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
     _ForwardIterator __last,
     _ForwardOutIterator __result,
     _UnaryOperation __op) {
+  // The interface for the function switched between C++17 and C++20
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
                 __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value &&
+#  if _LIBCPP_STD_VER <= 17
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value &&
+#  endif
                 __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
     std::__par_backend::__parallel_for_simd_2(__first, __last - __first, __result, __op);
     return __result + (__last - __first);
@@ -60,12 +63,15 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
     _ForwardIterator2 __first2,
     _ForwardOutIterator __result,
     _BinaryOperation __op) {
+  // The interface for the function switched between C++17 and C++20
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value &&
                 __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value &&
+#  if _LIBCPP_STD_VER <= 17
                 __libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator2>::value &&
+#  endif
                 __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
     std::__par_backend::__parallel_for_simd_3(__first1, __last1 - __first1, __first2, __result, __op);
     return __result + (__last1 - __first1);
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h
index 332bb8abc1b8e0b..eeffe62c040f083 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h
@@ -30,14 +30,16 @@
 
 #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
 
+_LIBCPP_BEGIN_NAMESPACE_STD
+
 template <class _T1, class _T2, class _T3>
-_LIBCPP_HIDE_FROM_ABI struct __is_supported_reduction : std::false_type {};
+struct _LIBCPP_HIDE_FROM_ABI __is_supported_reduction : std::false_type {};
 
 #  define __PSTL_IS_SUPPORTED_REDUCTION(funname)                                                                       \
     template <class _Tp>                                                                                               \
-    _LIBCPP_HIDE_FROM_ABI struct __is_supported_reduction<std::funname<_Tp>, _Tp, _Tp> : std::true_type {};            \
+    struct _LIBCPP_HIDE_FROM_ABI __is_supported_reduction<std::funname<_Tp>, _Tp, _Tp> : std::true_type {};            \
     template <class _Tp, class _Up>                                                                                    \
-    _LIBCPP_HIDE_FROM_ABI struct __is_supported_reduction<std::funname<>, _Tp, _Up> : std::true_type {};
+    struct _LIBCPP_HIDE_FROM_ABI __is_supported_reduction<std::funname<>, _Tp, _Up> : std::true_type {};
 
 // __is_trivial_plus_operation already exists
 __PSTL_IS_SUPPORTED_REDUCTION(plus)
@@ -49,8 +51,6 @@ __PSTL_IS_SUPPORTED_REDUCTION(bit_and)
 __PSTL_IS_SUPPORTED_REDUCTION(bit_or)
 __PSTL_IS_SUPPORTED_REDUCTION(bit_xor)
 
-_LIBCPP_BEGIN_NAMESPACE_STD
-
 //===----------------------------------------------------------------------===//
 // Two input iterators
 //===----------------------------------------------------------------------===//
@@ -69,11 +69,14 @@ _LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
     _Tp __init,
     _BinaryOperation1 __reduce,
     _BinaryOperation2 __transform) {
+  // The interface for the function switched between C++17 and C++20
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
-                __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value &&
+                __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value && is_arithmetic_v<_Tp> &&
+#  if _LIBCPP_STD_VER <= 17
                 __libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
-                __libcpp_is_contiguous_iterator<_ForwardIterator2>::value && is_arithmetic_v<_Tp> &&
+                __libcpp_is_contiguous_iterator<_ForwardIterator2>::value &&
+#  endif
                 (__is_trivial_plus_operation<_BinaryOperation1, _Tp, _Tp>::value ||
                  __is_supported_reduction<_BinaryOperation1, _Tp, _Tp>::value)) {
     return std::__par_backend::__parallel_for_simd_reduction_2(
@@ -95,9 +98,12 @@ _LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
     _Tp __init,
     _BinaryOperation __reduce,
     _UnaryOperation __transform) {
+  // The interface for the function switched between C++17 and C++20
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
-                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
-                __libcpp_is_contiguous_iterator<_ForwardIterator>::value && is_arithmetic_v<_Tp> &&
+                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value && is_arithmetic_v<_Tp> &&
+#  if _LIBCPP_STD_VER <= 17
+                __libcpp_is_contiguous_iterator<_ForwardIterator>::value &&
+#  endif
                 (__is_trivial_plus_operation<_BinaryOperation, _Tp, _Tp>::value ||
                  __is_supported_reduction<_BinaryOperation, _Tp, _Tp>::value)) {
     return std::__par_backend::__parallel_for_simd_reduction_1(

>From 59308a131b6432766cb4ff665e13a3fe308a1e52 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Fri, 6 Oct 2023 11:42:25 -0700
Subject: [PATCH 11/46] Restructuring pstl_backends/gpu/ to
 pstl_backends/openmp

---
 libcxx/CMakeLists.txt                         | 36 ++++++++-----------
 libcxx/include/CMakeLists.txt                 | 22 ++++++------
 libcxx/include/__algorithm/pstl_backend.h     | 34 +++++++++++++-----
 .../__algorithm/pstl_backends/cpu_backend.h   | 23 +++++++-----
 .../pstl_backends/cpu_backends/backend.h      |  2 ++
 .../__algorithm/pstl_backends/gpu_backend.h   | 27 --------------
 .../__algorithm/pstl_backends/openmp.h        | 27 ++++++++++++++
 .../{gpu_backends => openmp}/any_of.h         | 13 ++++---
 .../{gpu_backends => openmp}/backend.h        | 16 +++------
 .../{gpu_backends => openmp}/fill.h           | 13 ++++---
 .../{gpu_backends => openmp}/find_if.h        | 13 ++++---
 .../{gpu_backends => openmp}/for_each.h       | 19 +++++-----
 .../{gpu_backends => openmp}/merge.h          | 14 ++++----
 .../{gpu_backends => openmp}/omp_offload.h    |  6 ++--
 .../{gpu_backends => openmp}/stable_sort.h    | 13 ++++---
 .../{gpu_backends => openmp}/transform.h      | 25 ++++++-------
 .../transform_reduce.h                        | 23 +++++-------
 libcxx/include/__config_site.in               |  3 +-
 18 files changed, 161 insertions(+), 168 deletions(-)
 delete mode 100644 libcxx/include/__algorithm/pstl_backends/gpu_backend.h
 create mode 100644 libcxx/include/__algorithm/pstl_backends/openmp.h
 rename libcxx/include/__algorithm/pstl_backends/{gpu_backends => openmp}/any_of.h (70%)
 rename libcxx/include/__algorithm/pstl_backends/{gpu_backends => openmp}/backend.h (58%)
 rename libcxx/include/__algorithm/pstl_backends/{gpu_backends => openmp}/fill.h (78%)
 rename libcxx/include/__algorithm/pstl_backends/{gpu_backends => openmp}/find_if.h (71%)
 rename libcxx/include/__algorithm/pstl_backends/{gpu_backends => openmp}/for_each.h (74%)
 rename libcxx/include/__algorithm/pstl_backends/{gpu_backends => openmp}/merge.h (73%)
 rename libcxx/include/__algorithm/pstl_backends/{gpu_backends => openmp}/omp_offload.h (98%)
 rename libcxx/include/__algorithm/pstl_backends/{gpu_backends => openmp}/stable_sort.h (68%)
 rename libcxx/include/__algorithm/pstl_backends/{gpu_backends => openmp}/transform.h (76%)
 rename libcxx/include/__algorithm/pstl_backends/{gpu_backends => openmp}/transform_reduce.h (85%)

diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt
index b29dc867ca1f2c7..24dd562035d24ed 100644
--- a/libcxx/CMakeLists.txt
+++ b/libcxx/CMakeLists.txt
@@ -294,19 +294,14 @@ option(LIBCXX_HAS_WIN32_THREAD_API "Ignore auto-detection and force use of win32
 option(LIBCXX_HAS_EXTERNAL_THREAD_API
   "Build libc++ with an externalized threading API.
    This option may only be set to ON when LIBCXX_ENABLE_THREADS=ON." OFF)
-option(LIBCXX_ENABLE_GPU_OFFLOAD 
-  "Build libc++ with support for GPU offload" OFF)
 
-if (LIBCXX_ENABLE_THREADS)
-  set(LIBCXX_PSTL_CPU_BACKEND "std_thread" CACHE STRING "Which PSTL CPU backend to use")
-else()
-  set(LIBCXX_PSTL_CPU_BACKEND "serial" CACHE STRING "Which PSTL CPU backend to use")
-endif()
-
-if (${LIBCXX_ENABLE_GPU_OFFLOAD})
-  set(LIBCXX_PSTL_GPU_BACKEND "omp_offload" CACHE STRING "Which PSTL GPU backend to use")
-else()
-  set(LIBCXX_PSTL_GPU_BACKEND "none" CACHE STRING "Which PSTL GPU backend to use")
+set(LIBCXX_PSTL_BACKEND "openmp" CACHE INTERNAL "Which PSTL backend to use")
+if (LIBCXX_PSTL_BACKEND STREQUAL "")
+  if (LIBCXX_ENABLE_THREADS)
+    set(LIBCXX_PSTL_BACKEND "std-thread")
+  else()
+    set(LIBCXX_PSTL_BACKEND "serial")
+  endif()
 endif()
 
 # Misc options ----------------------------------------------------------------
@@ -780,20 +775,17 @@ elseif (LIBCXX_HARDENING_MODE STREQUAL "unchecked")
   config_define(0 _LIBCPP_ENABLE_DEBUG_MODE_DEFAULT)
 endif()
 
-if (LIBCXX_PSTL_CPU_BACKEND STREQUAL "serial")
+if (LIBCXX_PSTL_BACKEND STREQUAL "serial")
   config_define(1 _LIBCPP_PSTL_CPU_BACKEND_SERIAL)
-elseif(LIBCXX_PSTL_CPU_BACKEND STREQUAL "std_thread")
+elseif (LIBCXX_PSTL_BACKEND STREQUAL "std-thread")
   config_define(1 _LIBCPP_PSTL_CPU_BACKEND_THREAD)
-elseif(LIBCXX_PSTL_CPU_BACKEND STREQUAL "libdispatch")
+elseif (LIBCXX_PSTL_BACKEND STREQUAL "libdispatch")
   config_define(1 _LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH)
+elseif (LIBCXX_PSTL_BACKEND STREQUAL "openmp")
+  config_define(1 _LIBCPP_PSTL_BACKEND_OPENMP)
 else()
-  message(FATAL_ERROR "LIBCXX_PSTL_CPU_BACKEND is set to ${LIBCXX_PSTL_CPU_BACKEND}, which is not a valid backend.
-                       Valid backends are: serial, std_thread and libdispatch")
-endif()
-
-config_define_if(LIBCXX_ENABLE_GPU_OFFLOAD _LIBCPP_PSTL_GPU_OFFLOAD)
-if (LIBCXX_PSTL_GPU_BACKEND STREQUAL "omp_offload")
-  config_define(1 _LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
+  message(FATAL_ERROR "LIBCXX_PSTL_BACKEND is set to ${LIBCXX_PSTL_BACKEND}, which is not a valid backend.
+                       Valid backends are: serial, std-thread, libdispatch, and openmp.")
 endif()
 
 if (LIBCXX_ABI_DEFINES)
diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index cecde9c28f4f49d..64600d48acc4b8c 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -85,17 +85,17 @@ set(files
   __algorithm/pstl_backends/cpu_backends/thread.h
   __algorithm/pstl_backends/cpu_backends/transform.h
   __algorithm/pstl_backends/cpu_backends/transform_reduce.h
-  __algorithm/pstl_backends/gpu_backend.h
-  __algorithm/pstl_backends/gpu_backends/any_of.h
-  __algorithm/pstl_backends/gpu_backends/backend.h
-  __algorithm/pstl_backends/gpu_backends/fill.h
-  __algorithm/pstl_backends/gpu_backends/find_if.h
-  __algorithm/pstl_backends/gpu_backends/for_each.h
-  __algorithm/pstl_backends/gpu_backends/merge.h
-  __algorithm/pstl_backends/gpu_backends/omp_offload.h
-  __algorithm/pstl_backends/gpu_backends/stable_sort.h
-  __algorithm/pstl_backends/gpu_backends/transform.h
-  __algorithm/pstl_backends/gpu_backends/transform_reduce.h
+  __algorithm/pstl_backends/openmp.h
+  __algorithm/pstl_backends/openmp/any_of.h
+  __algorithm/pstl_backends/openmp/backend.h
+  __algorithm/pstl_backends/openmp/fill.h
+  __algorithm/pstl_backends/openmp/find_if.h
+  __algorithm/pstl_backends/openmp/for_each.h
+  __algorithm/pstl_backends/openmp/merge.h
+  __algorithm/pstl_backends/openmp/omp_offload.h
+  __algorithm/pstl_backends/openmp/stable_sort.h
+  __algorithm/pstl_backends/openmp/transform.h
+  __algorithm/pstl_backends/openmp/transform_reduce.h
   __algorithm/pstl_copy.h
   __algorithm/pstl_count.h
   __algorithm/pstl_fill.h
diff --git a/libcxx/include/__algorithm/pstl_backend.h b/libcxx/include/__algorithm/pstl_backend.h
index e7e7244d5e9666d..b3f202765cd41ba 100644
--- a/libcxx/include/__algorithm/pstl_backend.h
+++ b/libcxx/include/__algorithm/pstl_backend.h
@@ -10,7 +10,7 @@
 #define _LIBCPP___ALGORITHM_PSTL_BACKEND_H
 
 #include <__algorithm/pstl_backends/cpu_backend.h>
-#include <__algorithm/pstl_backends/gpu_backend.h>
+#include <__algorithm/pstl_backends/openmp.h>
 #include <__config>
 #include <execution>
 
@@ -192,6 +192,9 @@ frontend will turn that into a call to `std::__throw_bad_alloc();` to report the
 template <class _ExecutionPolicy>
 struct __select_backend;
 
+#  if defined(_LIBCPP_PSTL_CPU_BACKEND_SERIAL) || defined(_LIBCPP_PSTL_CPU_BACKEND_THREAD) ||                          \
+      defined(_LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH)
+
 template <>
 struct __select_backend<std::execution::sequenced_policy> {
   using type = __cpu_backend_tag;
@@ -204,25 +207,40 @@ struct __select_backend<std::execution::unsequenced_policy> {
 };
 #  endif
 
-#  if defined(_LIBCPP_PSTL_CPU_BACKEND_SERIAL) || defined(_LIBCPP_PSTL_CPU_BACKEND_THREAD) ||                          \
-      defined(_LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH)
 template <>
 struct __select_backend<std::execution::parallel_policy> {
   using type = __cpu_backend_tag;
 };
 
-#    if defined(_LIBCPP_PSTL_GPU_OFFLOAD)
 template <>
 struct __select_backend<std::execution::parallel_unsequenced_policy> {
-  using type = __gpu_backend_tag;
+  using type = __cpu_backend_tag;
 };
-#    else
+
+#  elif defined(_LIBCPP_PSTL_BACKEND_OPENMP)
+
 template <>
-struct __select_backend<std::execution::parallel_unsequenced_policy> {
-  using type = __cpu_backend_tag;
+struct __select_backend<std::execution::sequenced_policy> {
+  using type = __omp_backend_tag;
+};
+
+#    if _LIBCPP_STD_VER >= 20
+template <>
+struct __select_backend<std::execution::unsequenced_policy> {
+  using type = __omp_backend_tag;
 };
 #    endif
 
+template <>
+struct __select_backend<std::execution::parallel_policy> {
+  using type = __omp_backend_tag;
+};
+
+template <>
+struct __select_backend<std::execution::parallel_unsequenced_policy> {
+  using type = __omp_backend_tag;
+};
+
 #  else
 
 // ...New vendors can add parallel backends here...
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backend.h b/libcxx/include/__algorithm/pstl_backends/cpu_backend.h
index 6980ded189ea2a3..6de71ed702668bf 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backend.h
@@ -55,14 +55,19 @@ into a program termination at the front-end level. When a backend returns a dise
 frontend will turn that into a call to `std::__throw_bad_alloc();` to report the internal failure to the user.
 */
 
-#include <__algorithm/pstl_backends/cpu_backends/any_of.h>
-#include <__algorithm/pstl_backends/cpu_backends/backend.h>
-#include <__algorithm/pstl_backends/cpu_backends/fill.h>
-#include <__algorithm/pstl_backends/cpu_backends/find_if.h>
-#include <__algorithm/pstl_backends/cpu_backends/for_each.h>
-#include <__algorithm/pstl_backends/cpu_backends/merge.h>
-#include <__algorithm/pstl_backends/cpu_backends/stable_sort.h>
-#include <__algorithm/pstl_backends/cpu_backends/transform.h>
-#include <__algorithm/pstl_backends/cpu_backends/transform_reduce.h>
+#if defined(_LIBCPP_PSTL_CPU_BACKEND_SERIAL) || defined(_LIBCPP_PSTL_CPU_BACKEND_THREAD) ||                            \
+    defined(_LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH)
+
+#  include <__algorithm/pstl_backends/cpu_backends/any_of.h>
+#  include <__algorithm/pstl_backends/cpu_backends/backend.h>
+#  include <__algorithm/pstl_backends/cpu_backends/fill.h>
+#  include <__algorithm/pstl_backends/cpu_backends/find_if.h>
+#  include <__algorithm/pstl_backends/cpu_backends/for_each.h>
+#  include <__algorithm/pstl_backends/cpu_backends/merge.h>
+#  include <__algorithm/pstl_backends/cpu_backends/stable_sort.h>
+#  include <__algorithm/pstl_backends/cpu_backends/transform.h>
+#  include <__algorithm/pstl_backends/cpu_backends/transform_reduce.h>
+
+#endif
 
 #endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKEND_H
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/backend.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/backend.h
index ea2210a4a7adbdb..b8e9b1e28201d16 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/backend.h
@@ -18,6 +18,8 @@
 #  include <__algorithm/pstl_backends/cpu_backends/thread.h>
 #elif defined(_LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH)
 #  include <__algorithm/pstl_backends/cpu_backends/libdispatch.h>
+#elif defined(_LIBCPP_PSTL_BACKEND_OPENMP)
+// OpenMP covers both CPU and GPU backends
 #else
 #  error "Invalid CPU backend choice"
 #endif
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
deleted file mode 100644
index f41332fbf9f6d42..000000000000000
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
+++ /dev/null
@@ -1,27 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
-#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
-
-#include <__config>
-
-#include <__algorithm/pstl_backends/gpu_backends/backend.h>
-
-#if defined(_LIBCPP_PSTL_GPU_OFFLOAD)
-#  include <__algorithm/pstl_backends/gpu_backends/any_of.h>
-#  include <__algorithm/pstl_backends/gpu_backends/fill.h>
-#  include <__algorithm/pstl_backends/gpu_backends/find_if.h>
-#  include <__algorithm/pstl_backends/gpu_backends/for_each.h>
-#  include <__algorithm/pstl_backends/gpu_backends/merge.h>
-#  include <__algorithm/pstl_backends/gpu_backends/stable_sort.h>
-#  include <__algorithm/pstl_backends/gpu_backends/transform.h>
-#  include <__algorithm/pstl_backends/gpu_backends/transform_reduce.h>
-#endif
-
-#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp.h b/libcxx/include/__algorithm/pstl_backends/openmp.h
new file mode 100644
index 000000000000000..7787c82ff98825a
--- /dev/null
+++ b/libcxx/include/__algorithm/pstl_backends/openmp.h
@@ -0,0 +1,27 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
+
+#include <__config>
+
+#include <__algorithm/pstl_backends/openmp/backend.h>
+
+#if defined(_LIBCPP_PSTL_BACKEND_OPENMP)
+#  include <__algorithm/pstl_backends/openmp/any_of.h>
+#  include <__algorithm/pstl_backends/openmp/fill.h>
+#  include <__algorithm/pstl_backends/openmp/find_if.h>
+#  include <__algorithm/pstl_backends/openmp/for_each.h>
+#  include <__algorithm/pstl_backends/openmp/merge.h>
+#  include <__algorithm/pstl_backends/openmp/stable_sort.h>
+#  include <__algorithm/pstl_backends/openmp/transform.h>
+#  include <__algorithm/pstl_backends/openmp/transform_reduce.h>
+#endif
+
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/any_of.h b/libcxx/include/__algorithm/pstl_backends/openmp/any_of.h
similarity index 70%
rename from libcxx/include/__algorithm/pstl_backends/gpu_backends/any_of.h
rename to libcxx/include/__algorithm/pstl_backends/openmp/any_of.h
index 8d911de55dcd685..2b9a88fc58edcdd 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/any_of.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/any_of.h
@@ -6,13 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_ANY_OF_H
-#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_ANY_OF_H
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_ANY_OF_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_ANY_OF_H
 
 #include <__algorithm/any_of.h>
 #include <__algorithm/find_if.h>
-#include <__algorithm/pstl_backends/cpu_backends/backend.h>
-#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__atomic/atomic.h>
 #include <__atomic/memory_order.h>
 #include <__config>
@@ -29,13 +28,13 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <class _ExecutionPolicy, class _ForwardIterator, class _Predicate>
 _LIBCPP_HIDE_FROM_ABI bool
-__pstl_any_of(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) {
+__pstl_any_of(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) {
   // TODO: Implement GPU backend
-  return std::__pstl_any_of<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __pred);
+  return std::any_of(__first, __last, __pred);
 }
 
 _LIBCPP_END_NAMESPACE_STD
 
 #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
 
-#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_ANY_OF_H
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_ANY_OF_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/backend.h b/libcxx/include/__algorithm/pstl_backends/openmp/backend.h
similarity index 58%
rename from libcxx/include/__algorithm/pstl_backends/gpu_backends/backend.h
rename to libcxx/include/__algorithm/pstl_backends/openmp/backend.h
index a03ad35d8d2ae3e..e4e6136082a3427 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/backend.h
@@ -6,19 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_BACKEND_H
-#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_BACKEND_H
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_BACKEND_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_BACKEND_H
 
 #include <__config>
 #include <cstddef>
 
-#if defined(_LIBCPP_PSTL_GPU_OFFLOAD)
-#  if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
-#    include <__algorithm/pstl_backends/gpu_backends/omp_offload.h>
-#  else
-#    error Invalid PSTL GPU backend
-#  endif
-#endif
+#include <__algorithm/pstl_backends/openmp/omp_offload.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -28,10 +22,10 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
-struct __gpu_backend_tag {};
+struct __omp_backend_tag {};
 
 _LIBCPP_END_NAMESPACE_STD
 
 #endif // _LIBCPP_STD_VER >= 17
 
-#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_BACKEND_H
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_BACKEND_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
similarity index 78%
rename from libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
rename to libcxx/include/__algorithm/pstl_backends/openmp/fill.h
index f32ee8b016b3eae..56341b04f0e50df 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
@@ -6,12 +6,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FILL_H
-#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FILL_H
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_FILL_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_FILL_H
 
 #include <__algorithm/fill.h>
-#include <__algorithm/pstl_backends/cpu_backends/backend.h>
-#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
 #include <__iterator/concepts.h>
 #include <__iterator/iterator_traits.h>
@@ -29,7 +28,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <class _ExecutionPolicy, class _ForwardIterator, class _Tp>
 _LIBCPP_HIDE_FROM_ABI void
-__pstl_fill(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) {
+__pstl_fill(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) {
   // It is only safe to execute fill on the GPU, it the execution policy is
   // parallel unsequenced, as it is the only execution policy prohibiting throwing
   // exceptions and allowing SIMD instructions
@@ -40,7 +39,7 @@ __pstl_fill(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last
   }
   // Otherwise, we execute fill on the CPU instead
   else {
-    std::__pstl_fill<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __value);
+    std::fill(__first, __last, __value);
   }
 }
 
@@ -48,4 +47,4 @@ _LIBCPP_END_NAMESPACE_STD
 
 #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
 
-#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FILL_H
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_FILL_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/find_if.h b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
similarity index 71%
rename from libcxx/include/__algorithm/pstl_backends/gpu_backends/find_if.h
rename to libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
index 2d34938f92dff38..1e78cb67cd75978 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/find_if.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
@@ -6,12 +6,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_FIND_IF_H
-#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_FIND_IF_H
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_FIND_IF_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_FIND_IF_H
 
 #include <__algorithm/find_if.h>
-#include <__algorithm/pstl_backends/cpu_backends/backend.h>
-#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__atomic/atomic.h>
 #include <__config>
 #include <__functional/operations.h>
@@ -32,13 +31,13 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <class _ExecutionPolicy, class _ForwardIterator, class _Predicate>
 _LIBCPP_HIDE_FROM_ABI _ForwardIterator
-__pstl_find_if(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) {
+__pstl_find_if(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) {
   // TODO: Implement the GPU backend
-  return std::__pstl_find_if<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __pred);
+  return std::find_if(__first, __last, __pred);
 }
 
 _LIBCPP_END_NAMESPACE_STD
 
 #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
 
-#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_FIND_IF_H
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_FIND_IF_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
similarity index 74%
rename from libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
rename to libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
index f96b30b5ba25b24..401cdfade50a103 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
@@ -6,12 +6,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FOR_EACH_H
-#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FOR_EACH_H
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_FOR_EACH_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_FOR_EACH_H
 
 #include <__algorithm/for_each.h>
-#include <__algorithm/pstl_backends/cpu_backends/backend.h>
-#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
 #include <__iterator/concepts.h>
 #include <__type_traits/is_execution_policy.h>
@@ -28,7 +27,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <class _ExecutionPolicy, class _ForwardIterator, class _Functor>
 _LIBCPP_HIDE_FROM_ABI void
-__pstl_for_each(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Functor __func) {
+__pstl_for_each(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Functor __func) {
   // It is only safe to execute for_each on the GPU, it the execution policy is
   // parallel unsequenced, as it is the only execution policy prohibiting throwing
   // exceptions and allowing SIMD instructions
@@ -36,10 +35,10 @@ __pstl_for_each(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __
                 __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
     std::__par_backend::__parallel_for_simd_1(__first, __last - __first, __func);
-  }
-  // Else we fall back to the GPU backend
-  else {
-    std::__pstl_for_each<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __func);
+  } else {
+    // If it is not safe to offload to the GPU, we call the serial
+    // implementation
+    std::for_each(__first, __last, __func);
   }
 }
 
@@ -47,4 +46,4 @@ _LIBCPP_END_NAMESPACE_STD
 
 #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
 
-#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FOR_EACH_H
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_FOR_EACH_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/merge.h b/libcxx/include/__algorithm/pstl_backends/openmp/merge.h
similarity index 73%
rename from libcxx/include/__algorithm/pstl_backends/gpu_backends/merge.h
rename to libcxx/include/__algorithm/pstl_backends/openmp/merge.h
index bc947ebb27ac7f0..23036e1cc853980 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/merge.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/merge.h
@@ -6,12 +6,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_MERGE_H
-#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_MERGE_H
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_MERGE_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_MERGE_H
 
 #include <__algorithm/merge.h>
-#include <__algorithm/pstl_backends/cpu_backends/backend.h>
-#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
 #include <__iterator/concepts.h>
 #include <__type_traits/is_execution_policy.h>
@@ -32,7 +31,7 @@ template <class _ExecutionPolicy,
           class _ForwardOutIterator,
           class _Comp>
 _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_merge(
-    __gpu_backend_tag,
+    __omp_backend_tag,
     _ForwardIterator1 __first1,
     _ForwardIterator1 __last1,
     _ForwardIterator2 __first2,
@@ -40,12 +39,11 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_merge(
     _ForwardOutIterator __result,
     _Comp __comp) {
   // TODO: Implement GPU backend
-  return std::__pstl_merge<_ExecutionPolicy>(
-      __cpu_backend_tag{}, __first1, __last1, __first2, __last2, __result, __comp);
+  return std::merge(__first1, __last1, __first2, __last2, __result, __comp);
 }
 
 _LIBCPP_END_NAMESPACE_STD
 
 #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
 
-#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_MERGE_H
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_MERGE_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h b/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
similarity index 98%
rename from libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
rename to libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
index 81ec1bbc63d0083..ee0bfe13bac9010 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
@@ -6,8 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_OMP_OFFLOAD_H
-#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_OMP_OFFLOAD_H
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_OMP_OFFLOAD_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_OMP_OFFLOAD_H
 
 #include <__assert>
 #include <__config>
@@ -346,4 +346,4 @@ _LIBCPP_END_NAMESPACE_STD
 
 _LIBCPP_POP_MACROS
 
-#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_OMP_OFFLOAD_H
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_OMP_OFFLOAD_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/stable_sort.h b/libcxx/include/__algorithm/pstl_backends/openmp/stable_sort.h
similarity index 68%
rename from libcxx/include/__algorithm/pstl_backends/gpu_backends/stable_sort.h
rename to libcxx/include/__algorithm/pstl_backends/openmp/stable_sort.h
index 5cd7081ef73e9c1..7d28b8de77983a7 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/stable_sort.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/stable_sort.h
@@ -6,11 +6,10 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_STABLE_SORT_H
-#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_STABLE_SORT_H
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_STABLE_SORT_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_STABLE_SORT_H
 
-#include <__algorithm/pstl_backends/cpu_backends/backend.h>
-#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__algorithm/stable_sort.h>
 #include <__config>
 #include <__type_traits/is_execution_policy.h>
@@ -26,13 +25,13 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <class _ExecutionPolicy, class _RandomAccessIterator, class _Comp>
 _LIBCPP_HIDE_FROM_ABI void
-__pstl_stable_sort(__gpu_backend_tag, _RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp) {
+__pstl_stable_sort(__omp_backend_tag, _RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp) {
   // TODO: Implement GPU backend.
-  __pstl_stable_sort<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __comp);
+  std::stable_sort(__first, __last, __comp);
 }
 
 _LIBCPP_END_NAMESPACE_STD
 
 #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
 
-#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_STABLE_SORT_H
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_STABLE_SORT_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
similarity index 76%
rename from libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
rename to libcxx/include/__algorithm/pstl_backends/openmp/transform.h
index 3af2106eb2bda21..6dc7ab22c98cc35 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
@@ -6,11 +6,10 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_TRANSFORM_H
-#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_TRANSFORM_H
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_TRANSFORM_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_TRANSFORM_H
 
-#include <__algorithm/pstl_backends/cpu_backends/backend.h>
-#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__algorithm/transform.h>
 #include <__config>
 #include <__iterator/concepts.h>
@@ -30,7 +29,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <class _ExecutionPolicy, class _ForwardIterator, class _ForwardOutIterator, class _UnaryOperation>
 _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
-    __gpu_backend_tag,
+    __omp_backend_tag,
     _ForwardIterator __first,
     _ForwardIterator __last,
     _ForwardOutIterator __result,
@@ -46,8 +45,8 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
     std::__par_backend::__parallel_for_simd_2(__first, __last - __first, __result, __op);
     return __result + (__last - __first);
   }
-  // If it is not safe to offload to the GPU, we rely on the CPU backend.
-  return std::__pstl_transform<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __result, __op);
+  // If it is not safe to offload to the GPU, we rely on the serial backend.
+  return std::transform(__first, __last, __result, __op);
 }
 
 template <class _ExecutionPolicy,
@@ -57,7 +56,7 @@ template <class _ExecutionPolicy,
           class _BinaryOperation,
           enable_if_t<is_execution_policy_v<__remove_cvref_t<_ExecutionPolicy>>, int> = 0>
 _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
-    __gpu_backend_tag,
+    __omp_backend_tag,
     _ForwardIterator1 __first1,
     _ForwardIterator1 __last1,
     _ForwardIterator2 __first2,
@@ -68,20 +67,16 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
                 __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value &&
                 __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value &&
-#  if _LIBCPP_STD_VER <= 17
-                __libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
-                __libcpp_is_contiguous_iterator<_ForwardIterator2>::value &&
-#  endif
                 __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
     std::__par_backend::__parallel_for_simd_3(__first1, __last1 - __first1, __first2, __result, __op);
     return __result + (__last1 - __first1);
   }
-  // If it is not safe to offload to the GPU, we rely on the CPU backend.
-  return std::__pstl_transform<_ExecutionPolicy>(__cpu_backend_tag{}, __first1, __last1, __first2, __result, __op);
+  // If it is not safe to offload to the GPU, we call the serial implementation
+  return std::transform(__first1, __last1, __first2, __result, __op);
 }
 
 _LIBCPP_END_NAMESPACE_STD
 
 #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
 
-#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_TRANSFORM_H
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_TRANSFORM_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
similarity index 85%
rename from libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h
rename to libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
index eeffe62c040f083..c63e456b314c1c8 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
@@ -6,11 +6,10 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_TRANSFORM_REDUCE_H
-#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_TRANSFORM_REDUCE_H
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_TRANSFORM_REDUCE_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_TRANSFORM_REDUCE_H
 
-#include <__algorithm/pstl_backends/cpu_backends/backend.h>
-#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
 #include <__functional/operations.h>
 #include <__iterator/concepts.h>
@@ -62,7 +61,7 @@ template <class _ExecutionPolicy,
           class _BinaryOperation1,
           class _BinaryOperation2>
 _LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
-    __gpu_backend_tag,
+    __omp_backend_tag,
     _ForwardIterator1 __first1,
     _ForwardIterator1 __last1,
     _ForwardIterator2 __first2,
@@ -82,8 +81,7 @@ _LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
     return std::__par_backend::__parallel_for_simd_reduction_2(
         __first1, __first2, __last1 - __first1, __init, __reduce, __transform);
   }
-  return std::__pstl_transform_reduce<_ExecutionPolicy>(
-      __cpu_backend_tag{}, __first1, __last1, __first2, std::move(__init), __reduce, __transform);
+  return std::transform_reduce(__first1, __last1, __first2, std::move(__init), __reduce, __transform);
 }
 
 //===----------------------------------------------------------------------===//
@@ -92,7 +90,7 @@ _LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
 
 template <class _ExecutionPolicy, class _ForwardIterator, class _Tp, class _BinaryOperation, class _UnaryOperation>
 _LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
-    __gpu_backend_tag,
+    __omp_backend_tag,
     _ForwardIterator __first,
     _ForwardIterator __last,
     _Tp __init,
@@ -101,20 +99,17 @@ _LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
   // The interface for the function switched between C++17 and C++20
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator>::value && is_arithmetic_v<_Tp> &&
-#  if _LIBCPP_STD_VER <= 17
-                __libcpp_is_contiguous_iterator<_ForwardIterator>::value &&
-#  endif
                 (__is_trivial_plus_operation<_BinaryOperation, _Tp, _Tp>::value ||
                  __is_supported_reduction<_BinaryOperation, _Tp, _Tp>::value)) {
     return std::__par_backend::__parallel_for_simd_reduction_1(
         __first, __last - __first, __init, __reduce, __transform);
   }
-  return std::__pstl_transform_reduce<_ExecutionPolicy>(
-      __cpu_backend_tag{}, __first, __last, std::move(__init), __reduce, __transform);
+  // If it is not safe to offload to the GPU, we call the serial implementation
+  return std::transform_reduce(__first, __last, std::move(__init), __reduce, __transform);
 }
 
 _LIBCPP_END_NAMESPACE_STD
 
 #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
 
-#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_TRANSFORM_REDUCE_H
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_TRANSFORM_REDUCE_H
diff --git a/libcxx/include/__config_site.in b/libcxx/include/__config_site.in
index e7fb4f423079333..21334e797b89632 100644
--- a/libcxx/include/__config_site.in
+++ b/libcxx/include/__config_site.in
@@ -34,8 +34,7 @@
 #cmakedefine _LIBCPP_PSTL_CPU_BACKEND_SERIAL
 #cmakedefine _LIBCPP_PSTL_CPU_BACKEND_THREAD
 #cmakedefine _LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH
-#cmakedefine _LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD
-#cmakedefine _LIBCPP_PSTL_GPU_OFFLOAD
+#cmakedefine _LIBCPP_PSTL_BACKEND_OPENMP
 
 // Hardening.
 #cmakedefine01 _LIBCPP_ENABLE_HARDENED_MODE_DEFAULT

>From 067d47406881c7b4ca0e1f421e3b6dfd7590ed64 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Fri, 6 Oct 2023 12:39:29 -0700
Subject: [PATCH 12/46] Configured PSTL OpenMP backend to use the serial CPU
 backend when it is not safe to offload

---
 libcxx/include/__algorithm/pstl_backend.h      | 18 +++---------------
 .../__algorithm/pstl_backends/cpu_backend.h    |  5 -----
 .../pstl_backends/cpu_backends/backend.h       |  4 +---
 .../include/__algorithm/pstl_backends/openmp.h |  6 ++----
 .../__algorithm/pstl_backends/openmp/any_of.h  |  3 ++-
 .../__algorithm/pstl_backends/openmp/fill.h    |  3 ++-
 .../__algorithm/pstl_backends/openmp/find_if.h |  3 ++-
 .../pstl_backends/openmp/for_each.h            |  9 +++++----
 .../__algorithm/pstl_backends/openmp/merge.h   |  4 +++-
 .../pstl_backends/openmp/stable_sort.h         |  3 ++-
 .../pstl_backends/openmp/transform.h           | 12 +++++-------
 .../pstl_backends/openmp/transform_reduce.h    | 12 +++++-------
 12 files changed, 32 insertions(+), 50 deletions(-)

diff --git a/libcxx/include/__algorithm/pstl_backend.h b/libcxx/include/__algorithm/pstl_backend.h
index b3f202765cd41ba..dca9a29ce25d54e 100644
--- a/libcxx/include/__algorithm/pstl_backend.h
+++ b/libcxx/include/__algorithm/pstl_backend.h
@@ -192,9 +192,6 @@ frontend will turn that into a call to `std::__throw_bad_alloc();` to report the
 template <class _ExecutionPolicy>
 struct __select_backend;
 
-#  if defined(_LIBCPP_PSTL_CPU_BACKEND_SERIAL) || defined(_LIBCPP_PSTL_CPU_BACKEND_THREAD) ||                          \
-      defined(_LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH)
-
 template <>
 struct __select_backend<std::execution::sequenced_policy> {
   using type = __cpu_backend_tag;
@@ -207,6 +204,9 @@ struct __select_backend<std::execution::unsequenced_policy> {
 };
 #  endif
 
+#  if defined(_LIBCPP_PSTL_CPU_BACKEND_SERIAL) || defined(_LIBCPP_PSTL_CPU_BACKEND_THREAD) ||                          \
+      defined(_LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH)
+
 template <>
 struct __select_backend<std::execution::parallel_policy> {
   using type = __cpu_backend_tag;
@@ -219,18 +219,6 @@ struct __select_backend<std::execution::parallel_unsequenced_policy> {
 
 #  elif defined(_LIBCPP_PSTL_BACKEND_OPENMP)
 
-template <>
-struct __select_backend<std::execution::sequenced_policy> {
-  using type = __omp_backend_tag;
-};
-
-#    if _LIBCPP_STD_VER >= 20
-template <>
-struct __select_backend<std::execution::unsequenced_policy> {
-  using type = __omp_backend_tag;
-};
-#    endif
-
 template <>
 struct __select_backend<std::execution::parallel_policy> {
   using type = __omp_backend_tag;
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backend.h b/libcxx/include/__algorithm/pstl_backends/cpu_backend.h
index 6de71ed702668bf..5e9781d9bc59c12 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backend.h
@@ -55,9 +55,6 @@ into a program termination at the front-end level. When a backend returns a dise
 frontend will turn that into a call to `std::__throw_bad_alloc();` to report the internal failure to the user.
 */
 
-#if defined(_LIBCPP_PSTL_CPU_BACKEND_SERIAL) || defined(_LIBCPP_PSTL_CPU_BACKEND_THREAD) ||                            \
-    defined(_LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH)
-
 #  include <__algorithm/pstl_backends/cpu_backends/any_of.h>
 #  include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #  include <__algorithm/pstl_backends/cpu_backends/fill.h>
@@ -68,6 +65,4 @@ frontend will turn that into a call to `std::__throw_bad_alloc();` to report the
 #  include <__algorithm/pstl_backends/cpu_backends/transform.h>
 #  include <__algorithm/pstl_backends/cpu_backends/transform_reduce.h>
 
-#endif
-
 #endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKEND_H
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/backend.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/backend.h
index b8e9b1e28201d16..51aa878d734514d 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/backend.h
@@ -12,14 +12,12 @@
 #include <__config>
 #include <cstddef>
 
-#if defined(_LIBCPP_PSTL_CPU_BACKEND_SERIAL)
+#if defined(_LIBCPP_PSTL_CPU_BACKEND_SERIAL) || defined(_LIBCPP_PSTL_BACKEND_OPENMP)
 #  include <__algorithm/pstl_backends/cpu_backends/serial.h>
 #elif defined(_LIBCPP_PSTL_CPU_BACKEND_THREAD)
 #  include <__algorithm/pstl_backends/cpu_backends/thread.h>
 #elif defined(_LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH)
 #  include <__algorithm/pstl_backends/cpu_backends/libdispatch.h>
-#elif defined(_LIBCPP_PSTL_BACKEND_OPENMP)
-// OpenMP covers both CPU and GPU backends
 #else
 #  error "Invalid CPU backend choice"
 #endif
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp.h b/libcxx/include/__algorithm/pstl_backends/openmp.h
index 7787c82ff98825a..4c121e187ae246c 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp.h
@@ -9,11 +9,10 @@
 #ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
 #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
 
-#include <__config>
+#  include <__config>
 
-#include <__algorithm/pstl_backends/openmp/backend.h>
+#  include <__algorithm/pstl_backends/openmp/backend.h>
 
-#if defined(_LIBCPP_PSTL_BACKEND_OPENMP)
 #  include <__algorithm/pstl_backends/openmp/any_of.h>
 #  include <__algorithm/pstl_backends/openmp/fill.h>
 #  include <__algorithm/pstl_backends/openmp/find_if.h>
@@ -22,6 +21,5 @@
 #  include <__algorithm/pstl_backends/openmp/stable_sort.h>
 #  include <__algorithm/pstl_backends/openmp/transform.h>
 #  include <__algorithm/pstl_backends/openmp/transform_reduce.h>
-#endif
 
 #endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/any_of.h b/libcxx/include/__algorithm/pstl_backends/openmp/any_of.h
index 2b9a88fc58edcdd..a36f545d5bd719b 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/any_of.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/any_of.h
@@ -11,6 +11,7 @@
 
 #include <__algorithm/any_of.h>
 #include <__algorithm/find_if.h>
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__atomic/atomic.h>
 #include <__atomic/memory_order.h>
@@ -30,7 +31,7 @@ template <class _ExecutionPolicy, class _ForwardIterator, class _Predicate>
 _LIBCPP_HIDE_FROM_ABI bool
 __pstl_any_of(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) {
   // TODO: Implement GPU backend
-  return std::any_of(__first, __last, __pred);
+  return std::__pstl_any_of<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __pred);
 }
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/fill.h b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
index 56341b04f0e50df..6098769a4fedb2d 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
@@ -10,6 +10,7 @@
 #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_FILL_H
 
 #include <__algorithm/fill.h>
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
 #include <__iterator/concepts.h>
@@ -39,7 +40,7 @@ __pstl_fill(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __last
   }
   // Otherwise, we execute fill on the CPU instead
   else {
-    std::fill(__first, __last, __value);
+    std::__pstl_fill<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __value);
   }
 }
 
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
index 1e78cb67cd75978..6cd3b808e0700cc 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
@@ -10,6 +10,7 @@
 #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_FIND_IF_H
 
 #include <__algorithm/find_if.h>
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__atomic/atomic.h>
 #include <__config>
@@ -33,7 +34,7 @@ template <class _ExecutionPolicy, class _ForwardIterator, class _Predicate>
 _LIBCPP_HIDE_FROM_ABI _ForwardIterator
 __pstl_find_if(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) {
   // TODO: Implement the GPU backend
-  return std::find_if(__first, __last, __pred);
+  return std::__pstl_find_if<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __pred);
 }
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
index 401cdfade50a103..c1a159829989bb1 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
@@ -10,6 +10,7 @@
 #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_FOR_EACH_H
 
 #include <__algorithm/for_each.h>
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
 #include <__iterator/concepts.h>
@@ -35,10 +36,10 @@ __pstl_for_each(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __
                 __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
     std::__par_backend::__parallel_for_simd_1(__first, __last - __first, __func);
-  } else {
-    // If it is not safe to offload to the GPU, we call the serial
-    // implementation
-    std::for_each(__first, __last, __func);
+  }
+  // Else we fall back to the serial backend
+  else {
+    std::__pstl_for_each<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __func);
   }
 }
 
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/merge.h b/libcxx/include/__algorithm/pstl_backends/openmp/merge.h
index 23036e1cc853980..b6587b161d15488 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/merge.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/merge.h
@@ -10,6 +10,7 @@
 #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_MERGE_H
 
 #include <__algorithm/merge.h>
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
 #include <__iterator/concepts.h>
@@ -39,7 +40,8 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_merge(
     _ForwardOutIterator __result,
     _Comp __comp) {
   // TODO: Implement GPU backend
-  return std::merge(__first1, __last1, __first2, __last2, __result, __comp);
+  return std::__pstl_merge<_ExecutionPolicy>(
+      __cpu_backend_tag{}, __first1, __last1, __first2, __last2, __result, __comp);
 }
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/stable_sort.h b/libcxx/include/__algorithm/pstl_backends/openmp/stable_sort.h
index 7d28b8de77983a7..ac9323b04f63f90 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/stable_sort.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/stable_sort.h
@@ -9,6 +9,7 @@
 #ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_STABLE_SORT_H
 #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_STABLE_SORT_H
 
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__algorithm/stable_sort.h>
 #include <__config>
@@ -27,7 +28,7 @@ template <class _ExecutionPolicy, class _RandomAccessIterator, class _Comp>
 _LIBCPP_HIDE_FROM_ABI void
 __pstl_stable_sort(__omp_backend_tag, _RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp) {
   // TODO: Implement GPU backend.
-  std::stable_sort(__first, __last, __comp);
+  __pstl_stable_sort<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __comp);
 }
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
index 6dc7ab22c98cc35..f855c5146e0f39b 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
@@ -9,6 +9,7 @@
 #ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_TRANSFORM_H
 #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_TRANSFORM_H
 
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__algorithm/transform.h>
 #include <__config>
@@ -38,15 +39,12 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
                 __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value &&
-#  if _LIBCPP_STD_VER <= 17
-                __libcpp_is_contiguous_iterator<_ForwardIterator>::value &&
-#  endif
                 __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
     std::__par_backend::__parallel_for_simd_2(__first, __last - __first, __result, __op);
     return __result + (__last - __first);
   }
-  // If it is not safe to offload to the GPU, we rely on the serial backend.
-  return std::transform(__first, __last, __result, __op);
+  // If it is not safe to offload to the GPU, we rely on the CPU backend.
+  return std::__pstl_transform<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __result, __op);
 }
 
 template <class _ExecutionPolicy,
@@ -71,8 +69,8 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
     std::__par_backend::__parallel_for_simd_3(__first1, __last1 - __first1, __first2, __result, __op);
     return __result + (__last1 - __first1);
   }
-  // If it is not safe to offload to the GPU, we call the serial implementation
-  return std::transform(__first1, __last1, __first2, __result, __op);
+  // If it is not safe to offload to the GPU, we rely on the CPU backend.
+  return std::__pstl_transform<_ExecutionPolicy>(__cpu_backend_tag{}, __first1, __last1, __first2, __result, __op);
 }
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
index c63e456b314c1c8..b34d2be79921a71 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
@@ -9,6 +9,7 @@
 #ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_TRANSFORM_REDUCE_H
 #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_TRANSFORM_REDUCE_H
 
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
 #include <__functional/operations.h>
@@ -72,16 +73,13 @@ _LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value && is_arithmetic_v<_Tp> &&
-#  if _LIBCPP_STD_VER <= 17
-                __libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
-                __libcpp_is_contiguous_iterator<_ForwardIterator2>::value &&
-#  endif
                 (__is_trivial_plus_operation<_BinaryOperation1, _Tp, _Tp>::value ||
                  __is_supported_reduction<_BinaryOperation1, _Tp, _Tp>::value)) {
     return std::__par_backend::__parallel_for_simd_reduction_2(
         __first1, __first2, __last1 - __first1, __init, __reduce, __transform);
   }
-  return std::transform_reduce(__first1, __last1, __first2, std::move(__init), __reduce, __transform);
+  return std::__pstl_transform_reduce<_ExecutionPolicy>(
+      __cpu_backend_tag{}, __first1, __last1, __first2, std::move(__init), __reduce, __transform);
 }
 
 //===----------------------------------------------------------------------===//
@@ -104,8 +102,8 @@ _LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
     return std::__par_backend::__parallel_for_simd_reduction_1(
         __first, __last - __first, __init, __reduce, __transform);
   }
-  // If it is not safe to offload to the GPU, we call the serial implementation
-  return std::transform_reduce(__first, __last, std::move(__init), __reduce, __transform);
+  return std::__pstl_transform_reduce<_ExecutionPolicy>(
+      __cpu_backend_tag{}, __first, __last, std::move(__init), __reduce, __transform);
 }
 
 _LIBCPP_END_NAMESPACE_STD

>From 0b7fc361e31c2d9ed980734fd6115e4196d3a916 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Mon, 9 Oct 2023 11:48:58 -0700
Subject: [PATCH 13/46] Clang-formatted cpu_backend.h and openmp.h

---
 .../__algorithm/pstl_backends/cpu_backend.h   | 18 ++++++++---------
 .../__algorithm/pstl_backends/openmp.h        | 20 +++++++++----------
 2 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backend.h b/libcxx/include/__algorithm/pstl_backends/cpu_backend.h
index 5e9781d9bc59c12..6980ded189ea2a3 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backend.h
@@ -55,14 +55,14 @@ into a program termination at the front-end level. When a backend returns a dise
 frontend will turn that into a call to `std::__throw_bad_alloc();` to report the internal failure to the user.
 */
 
-#  include <__algorithm/pstl_backends/cpu_backends/any_of.h>
-#  include <__algorithm/pstl_backends/cpu_backends/backend.h>
-#  include <__algorithm/pstl_backends/cpu_backends/fill.h>
-#  include <__algorithm/pstl_backends/cpu_backends/find_if.h>
-#  include <__algorithm/pstl_backends/cpu_backends/for_each.h>
-#  include <__algorithm/pstl_backends/cpu_backends/merge.h>
-#  include <__algorithm/pstl_backends/cpu_backends/stable_sort.h>
-#  include <__algorithm/pstl_backends/cpu_backends/transform.h>
-#  include <__algorithm/pstl_backends/cpu_backends/transform_reduce.h>
+#include <__algorithm/pstl_backends/cpu_backends/any_of.h>
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
+#include <__algorithm/pstl_backends/cpu_backends/fill.h>
+#include <__algorithm/pstl_backends/cpu_backends/find_if.h>
+#include <__algorithm/pstl_backends/cpu_backends/for_each.h>
+#include <__algorithm/pstl_backends/cpu_backends/merge.h>
+#include <__algorithm/pstl_backends/cpu_backends/stable_sort.h>
+#include <__algorithm/pstl_backends/cpu_backends/transform.h>
+#include <__algorithm/pstl_backends/cpu_backends/transform_reduce.h>
 
 #endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKEND_H
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp.h b/libcxx/include/__algorithm/pstl_backends/openmp.h
index 4c121e187ae246c..6afc43f1faf3567 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp.h
@@ -9,17 +9,17 @@
 #ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
 #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
 
-#  include <__config>
+#include <__config>
 
-#  include <__algorithm/pstl_backends/openmp/backend.h>
+#include <__algorithm/pstl_backends/openmp/backend.h>
 
-#  include <__algorithm/pstl_backends/openmp/any_of.h>
-#  include <__algorithm/pstl_backends/openmp/fill.h>
-#  include <__algorithm/pstl_backends/openmp/find_if.h>
-#  include <__algorithm/pstl_backends/openmp/for_each.h>
-#  include <__algorithm/pstl_backends/openmp/merge.h>
-#  include <__algorithm/pstl_backends/openmp/stable_sort.h>
-#  include <__algorithm/pstl_backends/openmp/transform.h>
-#  include <__algorithm/pstl_backends/openmp/transform_reduce.h>
+#include <__algorithm/pstl_backends/openmp/any_of.h>
+#include <__algorithm/pstl_backends/openmp/fill.h>
+#include <__algorithm/pstl_backends/openmp/find_if.h>
+#include <__algorithm/pstl_backends/openmp/for_each.h>
+#include <__algorithm/pstl_backends/openmp/merge.h>
+#include <__algorithm/pstl_backends/openmp/stable_sort.h>
+#include <__algorithm/pstl_backends/openmp/transform.h>
+#include <__algorithm/pstl_backends/openmp/transform_reduce.h>
 
 #endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H

>From 42ff4edbdf934c00b91396eeacd2587ee848cbcf Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Mon, 9 Oct 2023 17:08:00 -0700
Subject: [PATCH 14/46] Removing unnecessary includes and redundant iterator
 checks

---
 libcxx/include/__algorithm/pstl_backend.h       |  2 +-
 .../include/__algorithm/pstl_backends/openmp.h  |  6 +++---
 .../__algorithm/pstl_backends/openmp/any_of.h   |  8 --------
 .../__algorithm/pstl_backends/openmp/fill.h     |  9 ++-------
 .../__algorithm/pstl_backends/openmp/find_if.h  |  8 --------
 .../__algorithm/pstl_backends/openmp/for_each.h |  4 ----
 .../__algorithm/pstl_backends/openmp/merge.h    |  4 ----
 .../pstl_backends/openmp/stable_sort.h          |  2 --
 .../pstl_backends/openmp/transform.h            |  6 ------
 .../pstl_backends/openmp/transform_reduce.h     | 17 +++++------------
 10 files changed, 11 insertions(+), 55 deletions(-)

diff --git a/libcxx/include/__algorithm/pstl_backend.h b/libcxx/include/__algorithm/pstl_backend.h
index dca9a29ce25d54e..e2705f2f9c81544 100644
--- a/libcxx/include/__algorithm/pstl_backend.h
+++ b/libcxx/include/__algorithm/pstl_backend.h
@@ -221,7 +221,7 @@ struct __select_backend<std::execution::parallel_unsequenced_policy> {
 
 template <>
 struct __select_backend<std::execution::parallel_policy> {
-  using type = __omp_backend_tag;
+  using type = __cpu_backend_tag;
 };
 
 template <>
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp.h b/libcxx/include/__algorithm/pstl_backends/openmp.h
index 6afc43f1faf3567..9e5490db655f4bd 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp.h
@@ -6,8 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
-#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_H
 
 #include <__config>
 
@@ -22,4 +22,4 @@
 #include <__algorithm/pstl_backends/openmp/transform.h>
 #include <__algorithm/pstl_backends/openmp/transform_reduce.h>
 
-#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_H
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/any_of.h b/libcxx/include/__algorithm/pstl_backends/openmp/any_of.h
index a36f545d5bd719b..beae485f41f248d 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/any_of.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/any_of.h
@@ -10,18 +10,10 @@
 #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_ANY_OF_H
 
 #include <__algorithm/any_of.h>
-#include <__algorithm/find_if.h>
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
-#include <__atomic/atomic.h>
-#include <__atomic/memory_order.h>
 #include <__config>
-#include <__functional/operations.h>
-#include <__iterator/concepts.h>
 #include <__type_traits/is_execution_policy.h>
-#include <__utility/pair.h>
-#include <__utility/terminate_on_exception.h>
-#include <cstdint>
 
 #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
 
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/fill.h b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
index 6098769a4fedb2d..19a3ba3f7be07bd 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
@@ -13,11 +13,7 @@
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
-#include <__iterator/concepts.h>
-#include <__iterator/iterator_traits.h>
 #include <__type_traits/is_execution_policy.h>
-#include <__utility/terminate_on_exception.h>
-#include <stdio.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -31,10 +27,9 @@ template <class _ExecutionPolicy, class _ForwardIterator, class _Tp>
 _LIBCPP_HIDE_FROM_ABI void
 __pstl_fill(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) {
   // It is only safe to execute fill on the GPU, it the execution policy is
-  // parallel unsequenced, as it is the only execution policy prohibiting throwing
-  // exceptions and allowing SIMD instructions
+  // parallel unsequenced, as it is the only execution policy allowing
+  // SIMD instructions
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
-                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
     std::__par_backend::__parallel_for_simd_val_1(__first, __last - __first, __value);
   }
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
index 6cd3b808e0700cc..4526d3976567a07 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
@@ -12,15 +12,7 @@
 #include <__algorithm/find_if.h>
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
-#include <__atomic/atomic.h>
 #include <__config>
-#include <__functional/operations.h>
-#include <__iterator/concepts.h>
-#include <__iterator/iterator_traits.h>
-#include <__type_traits/is_execution_policy.h>
-#include <__utility/pair.h>
-#include <__utility/terminate_on_exception.h>
-#include <cstddef>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
index c1a159829989bb1..30c2e5cca2f2649 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
@@ -13,10 +13,7 @@
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
-#include <__iterator/concepts.h>
 #include <__type_traits/is_execution_policy.h>
-#include <__utility/terminate_on_exception.h>
-#include <stdio.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -33,7 +30,6 @@ __pstl_for_each(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __
   // parallel unsequenced, as it is the only execution policy prohibiting throwing
   // exceptions and allowing SIMD instructions
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
-                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
     std::__par_backend::__parallel_for_simd_1(__first, __last - __first, __func);
   }
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/merge.h b/libcxx/include/__algorithm/pstl_backends/openmp/merge.h
index b6587b161d15488..666c3a0e0001933 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/merge.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/merge.h
@@ -13,10 +13,6 @@
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
-#include <__iterator/concepts.h>
-#include <__type_traits/is_execution_policy.h>
-#include <__utility/move.h>
-#include <__utility/terminate_on_exception.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/stable_sort.h b/libcxx/include/__algorithm/pstl_backends/openmp/stable_sort.h
index ac9323b04f63f90..46e48dd279e89bd 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/stable_sort.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/stable_sort.h
@@ -13,8 +13,6 @@
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__algorithm/stable_sort.h>
 #include <__config>
-#include <__type_traits/is_execution_policy.h>
-#include <__utility/terminate_on_exception.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
index f855c5146e0f39b..e507a16eac965b0 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
@@ -14,11 +14,7 @@
 #include <__algorithm/transform.h>
 #include <__config>
 #include <__iterator/concepts.h>
-#include <__iterator/iterator_traits.h>
-#include <__type_traits/enable_if.h>
 #include <__type_traits/is_execution_policy.h>
-#include <__type_traits/remove_cvref.h>
-#include <__utility/terminate_on_exception.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -38,7 +34,6 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
   // The interface for the function switched between C++17 and C++20
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
-                __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
     std::__par_backend::__parallel_for_simd_2(__first, __last - __first, __result, __op);
     return __result + (__last - __first);
@@ -64,7 +59,6 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value &&
-                __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
     std::__par_backend::__parallel_for_simd_3(__first1, __last1 - __first1, __first2, __result, __op);
     return __result + (__last1 - __first1);
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
index b34d2be79921a71..eb0a7ba52d8fdcc 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
@@ -12,17 +12,12 @@
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
-#include <__functional/operations.h>
 #include <__iterator/concepts.h>
-#include <__iterator/iterator_traits.h>
 #include <__numeric/transform_reduce.h>
 #include <__type_traits/integral_constant.h>
 #include <__type_traits/is_arithmetic.h>
 #include <__type_traits/is_execution_policy.h>
 #include <__type_traits/operation_traits.h>
-#include <__utility/move.h>
-#include <__utility/terminate_on_exception.h>
-#include <new>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -33,13 +28,13 @@
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <class _T1, class _T2, class _T3>
-struct _LIBCPP_HIDE_FROM_ABI __is_supported_reduction : std::false_type {};
+struct __is_supported_reduction : std::false_type {};
 
 #  define __PSTL_IS_SUPPORTED_REDUCTION(funname)                                                                       \
     template <class _Tp>                                                                                               \
-    struct _LIBCPP_HIDE_FROM_ABI __is_supported_reduction<std::funname<_Tp>, _Tp, _Tp> : std::true_type {};            \
+    struct __is_supported_reduction<std::funname<_Tp>, _Tp, _Tp> : std::true_type {};                                  \
     template <class _Tp, class _Up>                                                                                    \
-    struct _LIBCPP_HIDE_FROM_ABI __is_supported_reduction<std::funname<>, _Tp, _Up> : std::true_type {};
+    struct __is_supported_reduction<std::funname<>, _Tp, _Up> : std::true_type {};
 
 // __is_trivial_plus_operation already exists
 __PSTL_IS_SUPPORTED_REDUCTION(plus)
@@ -73,8 +68,7 @@ _LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value && is_arithmetic_v<_Tp> &&
-                (__is_trivial_plus_operation<_BinaryOperation1, _Tp, _Tp>::value ||
-                 __is_supported_reduction<_BinaryOperation1, _Tp, _Tp>::value)) {
+                __is_supported_reduction<_BinaryOperation1, _Tp, _Tp>::value) {
     return std::__par_backend::__parallel_for_simd_reduction_2(
         __first1, __first2, __last1 - __first1, __init, __reduce, __transform);
   }
@@ -97,8 +91,7 @@ _LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
   // The interface for the function switched between C++17 and C++20
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator>::value && is_arithmetic_v<_Tp> &&
-                (__is_trivial_plus_operation<_BinaryOperation, _Tp, _Tp>::value ||
-                 __is_supported_reduction<_BinaryOperation, _Tp, _Tp>::value)) {
+                __is_supported_reduction<_BinaryOperation, _Tp, _Tp>::value) {
     return std::__par_backend::__parallel_for_simd_reduction_1(
         __first, __last - __first, __init, __reduce, __transform);
   }

>From 7597f19695fa7e21a66d83bbb55913e09c70a262 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Mon, 16 Oct 2023 01:06:48 -0700
Subject: [PATCH 15/46] Adding optional to OpenMP backend and other
 refactorings requested during code review

---
 .../__algorithm/pstl_backends/openmp/any_of.h |   3 +-
 .../__algorithm/pstl_backends/openmp/fill.h   |  32 ++-
 .../pstl_backends/openmp/find_if.h            |  39 +++-
 .../pstl_backends/openmp/for_each.h           |  33 +++-
 .../__algorithm/pstl_backends/openmp/merge.h  |   3 +-
 .../pstl_backends/openmp/omp_offload.h        | 185 +++---------------
 .../pstl_backends/openmp/stable_sort.h        |   6 +-
 .../pstl_backends/openmp/transform.h          |  93 ++++++++-
 .../pstl_backends/openmp/transform_reduce.h   |  16 +-
 9 files changed, 225 insertions(+), 185 deletions(-)

diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/any_of.h b/libcxx/include/__algorithm/pstl_backends/openmp/any_of.h
index beae485f41f248d..65f2294ff2ee5fe 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/any_of.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/any_of.h
@@ -14,13 +14,14 @@
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
 #include <__type_traits/is_execution_policy.h>
+#include <optional>
 
 #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <class _ExecutionPolicy, class _ForwardIterator, class _Predicate>
-_LIBCPP_HIDE_FROM_ABI bool
+_LIBCPP_HIDE_FROM_ABI optional<bool>
 __pstl_any_of(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) {
   // TODO: Implement GPU backend
   return std::__pstl_any_of<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __pred);
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/fill.h b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
index 19a3ba3f7be07bd..250e514b4d526fd 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
@@ -14,6 +14,8 @@
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
 #include <__type_traits/is_execution_policy.h>
+#include <__utility/empty.h>
+#include <optional>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -23,19 +25,43 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
+namespace __par_backend {
+inline namespace __omp_gpu_backend {
+
+template <class _Tp, class _DifferenceType, class _Up>
+_LIBCPP_HIDE_FROM_ABI optional<__empty> __omp_parallel_for_simd_val(
+    _Tp* __first, _DifferenceType __n, const _Up& __value) noexcept {
+  __omp_gpu_backend::__omp_map_alloc(__first, __n);
+#  pragma omp target teams distribute parallel for simd firstprivate(__value)
+  for (_DifferenceType __i = 0; __i < __n; ++__i)
+    *(__first+__i) = __value;
+  __omp_gpu_backend::__omp_map_from(__first, __n);
+  return __empty{};
+}
+
+template <class _ForwardIterator, class _DifferenceType, class _Tp>
+_LIBCPP_HIDE_FROM_ABI optional<__empty>
+__parallel_for_simd_val(_ForwardIterator __first, _DifferenceType __n, const _Tp& __value) noexcept {
+  return __omp_gpu_backend::__omp_parallel_for_simd_val(__omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __value);
+}
+
+}
+}
+
 template <class _ExecutionPolicy, class _ForwardIterator, class _Tp>
-_LIBCPP_HIDE_FROM_ABI void
+_LIBCPP_HIDE_FROM_ABI optional<__empty>
 __pstl_fill(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) {
   // It is only safe to execute fill on the GPU, it the execution policy is
   // parallel unsequenced, as it is the only execution policy allowing
   // SIMD instructions
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
+                __is_parallel_execution_policy_v<_ExecutionPolicy> &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
-    std::__par_backend::__parallel_for_simd_val_1(__first, __last - __first, __value);
+    return std::__par_backend::__parallel_for_simd_val(__first, __last - __first, __value);
   }
   // Otherwise, we execute fill on the CPU instead
   else {
-    std::__pstl_fill<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __value);
+    return std::__pstl_fill<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __value);
   }
 }
 
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
index 4526d3976567a07..94f7c42953c2a4a 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
@@ -13,6 +13,7 @@
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
+#include <optional>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -22,11 +23,43 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
+namespace __par_backend {
+inline namespace __omp_gpu_backend {
+
+template <class _Tp, class _DifferenceType, class _Predicate>
+_LIBCPP_HIDE_FROM_ABI _DifferenceType  __omp_parallel_for_min_idx(_Tp* __first, _DifferenceType __n, _Predicate __pred) noexcept {
+  __omp_gpu_backend::__omp_map_to(__first, __n);
+  _DifferenceType idx = __n;
+#  pragma omp target teams distribute parallel for simd reduction(min:idx)
+  for (_DifferenceType __i = 0; __i < __n; ++__i){
+    if (__pred(*(__first+__i))) {
+      idx = (__i < idx) ? __i : idx;
+    }
+  }
+  __omp_gpu_backend::__omp_map_free(__first, __n);
+  return idx;
+}
+
+// Extracting the underlying pointer
+
+template <class _ForwardIterator, class _DifferenceType, class _Predicate>
+_LIBCPP_HIDE_FROM_ABI optional<_ForwardIterator> __parallel_for_min_idx(_ForwardIterator __first, _DifferenceType __n, _Predicate __pred) noexcept {
+  return __first + __omp_gpu_backend::__omp_parallel_for_min_idx(__omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __pred);
+}
+
+}
+}
+
 template <class _ExecutionPolicy, class _ForwardIterator, class _Predicate>
-_LIBCPP_HIDE_FROM_ABI _ForwardIterator
+_LIBCPP_HIDE_FROM_ABI optional<_ForwardIterator>
 __pstl_find_if(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) {
-  // TODO: Implement the GPU backend
-  return std::__pstl_find_if<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __pred);
+  if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
+                __is_parallel_execution_policy_v<_ExecutionPolicy> &&
+                __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
+    return __par_backend::__parallel_for_min_idx(__first, __last - __first, __pred);
+  } else {
+    return std::__pstl_find_if<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __pred);
+  }
 }
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
index 30c2e5cca2f2649..444352b4a5e51a8 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
@@ -14,6 +14,8 @@
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
 #include <__type_traits/is_execution_policy.h>
+#include <optional>
+#include <__utility/empty.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -23,19 +25,44 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
+namespace __par_backend {
+inline namespace __omp_gpu_backend {
+
+template <class _Tp, class _DifferenceType, class _Function>
+_LIBCPP_HIDE_FROM_ABI optional<__empty> __omp_parallel_for_simd(
+    _Tp* __first, _DifferenceType __n, _Function __f) noexcept {
+  __omp_gpu_backend::__omp_map_to(__first, __n);
+#  pragma omp target teams distribute parallel for simd
+  for (_DifferenceType __i = 0; __i < __n; ++__i)
+    __f(*(__first+__i));
+  __omp_gpu_backend::__omp_map_from(__first, __n);
+  return __empty{};
+}
+
+// Extracting the underlying pointer
+
+template <class _ForwardIterator, class _DifferenceType, class _Function>
+_LIBCPP_HIDE_FROM_ABI optional<__empty> __parallel_for_simd_1(_ForwardIterator __first, _DifferenceType __n, _Function __f) noexcept {
+  return __omp_gpu_backend::__omp_parallel_for_simd(__omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __f);
+}
+
+}
+}
+
 template <class _ExecutionPolicy, class _ForwardIterator, class _Functor>
-_LIBCPP_HIDE_FROM_ABI void
+_LIBCPP_HIDE_FROM_ABI optional<__empty>
 __pstl_for_each(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Functor __func) {
   // It is only safe to execute for_each on the GPU, it the execution policy is
   // parallel unsequenced, as it is the only execution policy prohibiting throwing
   // exceptions and allowing SIMD instructions
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
+                __libcpp_is_contiguous_iterator<_ForwardIterator>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
-    std::__par_backend::__parallel_for_simd_1(__first, __last - __first, __func);
+    return std::__par_backend::__parallel_for_simd_1(__first, __last - __first, __func);
   }
   // Else we fall back to the serial backend
   else {
-    std::__pstl_for_each<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __func);
+    return std::__pstl_for_each<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __func);
   }
 }
 
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/merge.h b/libcxx/include/__algorithm/pstl_backends/openmp/merge.h
index 666c3a0e0001933..8fff9125add191d 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/merge.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/merge.h
@@ -13,6 +13,7 @@
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
+#include <optional>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -27,7 +28,7 @@ template <class _ExecutionPolicy,
           class _ForwardIterator2,
           class _ForwardOutIterator,
           class _Comp>
-_LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_merge(
+_LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator> __pstl_merge(
     __omp_backend_tag,
     _ForwardIterator1 __first1,
     _ForwardIterator1 __last1,
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h b/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
index ee0bfe13bac9010..77afb35ee71b4bb 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
@@ -10,6 +10,7 @@
 #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_OMP_OFFLOAD_H
 
 #include <__assert>
+#include <__algorithm/unwrap_iter.h>
 #include <__config>
 #include <__functional/operations.h>
 #include <__iterator/iterator_traits.h>
@@ -19,7 +20,9 @@
 #include <__type_traits/is_pointer.h>
 #include <__type_traits/is_same.h>
 #include <__utility/move.h>
+#include <__utility/empty.h>
 #include <cstddef>
+#include <optional>
 
 // is_same
 
@@ -39,13 +42,15 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 namespace __par_backend {
 inline namespace __omp_gpu_backend {
 
+//===----------------------------------------------------------------------===//
 // Functions for eaxtracting the pase pointers
+//===----------------------------------------------------------------------===//
 
 // In the general case we do not need to extract it. This is for instance the
 // case for pointers.
 template <typename _Tp>
 _LIBCPP_HIDE_FROM_ABI inline auto __omp_extract_base_ptr(_Tp p) noexcept {
-  return p;
+  return std::__unwrap_iter(p);
 }
 
 // For vectors and arrays, etc, we need to extract the underlying base pointer.
@@ -64,159 +69,29 @@ _LIBCPP_HIDE_FROM_ABI inline _Tp __omp_extract_base_ptr(std::__wrap_iter<_Tp> w)
 template <class _Iterator, class _DifferenceType>
 _LIBCPP_HIDE_FROM_ABI void
 __omp_map_to([[maybe_unused]] const _Iterator p, [[maybe_unused]] const _DifferenceType len) noexcept {
-  if constexpr (__libcpp_is_contiguous_iterator<_Iterator>::value) {
-#  pragma omp target enter data map(to : p[0 : len])
-  } else {
-#  pragma omp target enter data map(to : p)
-  }
+  static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
+#pragma omp target enter data map(to : p[0 : len])
 }
 
 template <class _Iterator, class _DifferenceType>
 _LIBCPP_HIDE_FROM_ABI void
 __omp_map_from([[maybe_unused]] const _Iterator p, [[maybe_unused]] const _DifferenceType len) noexcept {
-  if constexpr (__libcpp_is_contiguous_iterator<_Iterator>::value) {
-#  pragma omp target exit data map(from : p[0 : len])
-  } else {
-#  pragma omp target exit data map(release : p)
-  }
+  static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
+#pragma omp target exit data map(from : p[0 : len])
 }
 
 template <class _Iterator, class _DifferenceType>
 _LIBCPP_HIDE_FROM_ABI void
 __omp_map_alloc([[maybe_unused]] const _Iterator p, [[maybe_unused]] const _DifferenceType len) noexcept {
-  if constexpr (__libcpp_is_contiguous_iterator<_Iterator>::value) {
-#  pragma omp target enter data map(alloc : p[0 : len])
-  } else {
-#  pragma omp target enter data map(to : p)
-  }
+  static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
+#pragma omp target enter data map(alloc : p[0 : len])
 }
 
 template <class _Iterator, class _DifferenceType>
 _LIBCPP_HIDE_FROM_ABI void
 __omp_map_free([[maybe_unused]] const _Iterator p, [[maybe_unused]] const _DifferenceType len) noexcept {
-  if constexpr (__libcpp_is_contiguous_iterator<_Iterator>::value) {
-#  pragma omp target exit data map(release : p[0 : len])
-  } else {
-#  pragma omp target exit data map(release : p)
-  }
-}
-
-//===----------------------------------------------------------------------===//
-// Templates for one iterator
-//===----------------------------------------------------------------------===//
-
-// Applying function or lambda in a loop
-
-template <class _Iterator, class _DifferenceType, class _Function>
-_LIBCPP_HIDE_FROM_ABI _Iterator __omp_parallel_for_simd_1(
-    _Iterator __first, _DifferenceType __n, _Function __f, [[maybe_unused]] const int __device = 0) noexcept {
-  __omp_map_to(__first, __n);
-#  pragma omp target teams distribute parallel for simd device(__device)
-  for (_DifferenceType __i = 0; __i < __n; ++__i)
-    __f(__first[__i]);
-  __omp_map_from(__first, __n);
-
-  return __first + __n;
-}
-
-// Extracting the underlying pointer
-
-template <class _Iterator, class _DifferenceType, class _Function>
-_LIBCPP_HIDE_FROM_ABI _Iterator __parallel_for_simd_1(_Iterator __first, _DifferenceType __n, _Function __f) noexcept {
-  __omp_parallel_for_simd_1(__omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __f);
-  return __first + __n;
-}
-
-// Assigning a value in a loop
-
-template <class _Index, class _DifferenceType, class _Tp>
-_LIBCPP_HIDE_FROM_ABI _Index __omp_parallel_for_simd_val_1(
-    _Index __first, _DifferenceType __n, const _Tp& __value, [[maybe_unused]] const int __device = 0) noexcept {
-  __omp_map_alloc(__first, __n);
-#  pragma omp target teams distribute parallel for simd firstprivate(__value) device(__device)
-  for (_DifferenceType __i = 0; __i < __n; ++__i)
-    __first[__i] = __value;
-  __omp_map_from(__first, __n);
-  return __first + __n;
-}
-
-template <class _Index, class _DifferenceType, class _Tp>
-_LIBCPP_HIDE_FROM_ABI _Index
-__parallel_for_simd_val_1(_Index __first, _DifferenceType __n, const _Tp& __value) noexcept {
-  __omp_parallel_for_simd_val_1(__omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __value);
-  return __first + __n;
-}
-
-//===----------------------------------------------------------------------===//
-// Templates for two iterators
-//===----------------------------------------------------------------------===//
-
-template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Function>
-_LIBCPP_HIDE_FROM_ABI _Iterator1 __omp_parallel_for_simd_2(
-    _Iterator1 __first1,
-    _DifferenceType __n,
-    _Iterator2 __first2,
-    _Function __f,
-    [[maybe_unused]] const int __device = 0) noexcept {
-  __omp_map_alloc(__first2, __n);
-  __omp_map_to(__first1, __n);
-#  pragma omp target teams distribute parallel for simd device(__device)
-  for (_DifferenceType __i = 0; __i < __n; ++__i)
-    *(__first2 + __i) = __f(*(__first1 + __i));
-  __omp_map_from(__first2, __n);
-  __omp_map_free(__first1, __n);
-  return __first1 + __n;
-}
-
-// Extracting the underlying pointer
-
-template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Function>
-_LIBCPP_HIDE_FROM_ABI _Iterator1
-__parallel_for_simd_2(_Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Function __f) noexcept {
-  __omp_parallel_for_simd_2(
-      __omp_gpu_backend::__omp_extract_base_ptr(__first1),
-      __n,
-      __omp_gpu_backend::__omp_extract_base_ptr(__first2),
-      __f);
-  return __first1 + __n;
-}
-
-//===----------------------------------------------------------------------===//
-// Templates for three iterator
-//===----------------------------------------------------------------------===//
-
-template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Iterator3, class _Function>
-_LIBCPP_HIDE_FROM_ABI _Iterator1 __omp_parallel_for_simd_3(
-    _Iterator1 __first1,
-    _DifferenceType __n,
-    _Iterator2 __first2,
-    _Iterator3 __first3,
-    _Function __f,
-    [[maybe_unused]] const int __device = 0) noexcept {
-  __omp_map_to(__first1, __n);
-  __omp_map_to(__first2, __n);
-  __omp_map_alloc(__first3, __n);
-#  pragma omp target teams distribute parallel for simd device(__device)
-  for (_DifferenceType __i = 0; __i < __n; ++__i)
-    *(__first3 + __i) = __f(*(__first1 + __i), *(__first2 + __i));
-  __omp_map_free(__first1, __n);
-  __omp_map_free(__first2, __n);
-  __omp_map_from(__first3, __n);
-  return __first1 + __n;
-}
-
-// Extracting the underlying pointer
-
-template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Iterator3, class _Function>
-_LIBCPP_HIDE_FROM_ABI _Iterator1 __parallel_for_simd_3(
-    _Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Iterator3 __first3, _Function __f) noexcept {
-  __omp_parallel_for_simd_3(
-      __omp_gpu_backend::__omp_extract_base_ptr(__first1),
-      __n,
-      __omp_gpu_backend::__omp_extract_base_ptr(__first2),
-      __omp_gpu_backend::__omp_extract_base_ptr(__first3),
-      __f);
-  return __first1 + __n;
+  static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
+#pragma omp target exit data map(release : p[0 : len])
 }
 
 //===----------------------------------------------------------------------===//
@@ -237,13 +112,12 @@ _LIBCPP_HIDE_FROM_ABI _Iterator1 __parallel_for_simd_3(
         _DifferenceType __n,                                                                                           \
         _Tp __init,                                                                                                    \
         std_op<_BinaryOperationType> __reduce,                                                                         \
-        _UnaryOperation __transform,                                                                                   \
-        [[maybe_unused]] const int __device = 0) noexcept {                                                            \
-      __omp_map_to(__first, __n);                                                                                      \
-_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init) device(__device))                  \
+        _UnaryOperation __transform) noexcept {                                                                         \
+      __omp_gpu_backend::__omp_map_to(__first, __n);                                                                                      \
+_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init))                                    \
       for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                  \
         __init = __reduce(__init, __transform(*(__first + __i)));                                                      \
-      __omp_map_free(__first, __n);                                                                                    \
+      __omp_gpu_backend::__omp_map_free(__first, __n);                                                                                    \
       return __init;                                                                                                   \
     }
 
@@ -260,15 +134,14 @@ _PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__in
         _DifferenceType __n,                                                                                           \
         _Tp __init,                                                                                                    \
         std_op<_BinaryOperationType> __reduce,                                                                         \
-        _UnaryOperation __transform,                                                                                   \
-        [[maybe_unused]] const int __device = 0) noexcept {                                                            \
-      __omp_map_to(__first1, __n);                                                                                     \
-      __omp_map_to(__first2, __n);                                                                                     \
-_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init) device(__device))                  \
+        _UnaryOperation __transform) noexcept {                                                                         \
+      __omp_gpu_backend::__omp_map_to(__first1, __n);                                                                                     \
+      __omp_gpu_backend::__omp_map_to(__first2, __n);                                                                                     \
+_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init))                                    \
       for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                  \
         __init = __reduce(__init, __transform(*(__first1 + __i), *(__first2 + __i)));                                  \
-      __omp_map_free(__first1, __n);                                                                                   \
-      __omp_map_free(__first2, __n);                                                                                   \
+      __omp_gpu_backend::__omp_map_free(__first1, __n);                                                                                   \
+      __omp_gpu_backend::__omp_map_free(__first2, __n);                                                                                   \
       return __init;                                                                                                   \
     } // namespace __omp_gpu_backend
 
@@ -308,9 +181,8 @@ _LIBCPP_HIDE_FROM_ABI _Tp __parallel_for_simd_reduction_1(
     _DifferenceType __n,
     _Tp __init,
     _BinaryOperation __reduce,
-    _UnaryOperation __transform,
-    [[maybe_unused]] const int __device = 0) noexcept {
-  return __omp_parallel_for_simd_reduction_1(
+    _UnaryOperation __transform) noexcept {
+  return __omp_gpu_backend::__omp_parallel_for_simd_reduction_1(
       __omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __init, __reduce, __transform);
 }
 
@@ -326,9 +198,8 @@ _LIBCPP_HIDE_FROM_ABI _Tp __parallel_for_simd_reduction_2(
     _DifferenceType __n,
     _Tp __init,
     _BinaryOperation __reduce,
-    _UnaryOperation __transform,
-    [[maybe_unused]] const int __device = 0) noexcept {
-  return __omp_parallel_for_simd_reduction_2(
+    _UnaryOperation __transform) noexcept {
+  return __omp_gpu_backend::__omp_parallel_for_simd_reduction_2(
       __omp_gpu_backend::__omp_extract_base_ptr(__first1),
       __omp_gpu_backend::__omp_extract_base_ptr(__first2),
       __n,
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/stable_sort.h b/libcxx/include/__algorithm/pstl_backends/openmp/stable_sort.h
index 46e48dd279e89bd..a4c6a2bff9f92fd 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/stable_sort.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/stable_sort.h
@@ -13,6 +13,8 @@
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__algorithm/stable_sort.h>
 #include <__config>
+#include <__utility/empty.h>
+#include <optional>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -23,10 +25,10 @@
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <class _ExecutionPolicy, class _RandomAccessIterator, class _Comp>
-_LIBCPP_HIDE_FROM_ABI void
+_LIBCPP_HIDE_FROM_ABI optional<__empty>
 __pstl_stable_sort(__omp_backend_tag, _RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp) {
   // TODO: Implement GPU backend.
-  __pstl_stable_sort<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __comp);
+  return __pstl_stable_sort<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __comp);
 }
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
index e507a16eac965b0..b29a479c17887c7 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
@@ -15,6 +15,7 @@
 #include <__config>
 #include <__iterator/concepts.h>
 #include <__type_traits/is_execution_policy.h>
+#include <optional>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -24,18 +25,92 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
+namespace __par_backend {
+inline namespace __omp_gpu_backend {
+
+//===----------------------------------------------------------------------===//
+// Templates for two iterators
+//===----------------------------------------------------------------------===//
+
+template <class _Tp, class _DifferenceType, class _Up, class _Function>
+_LIBCPP_HIDE_FROM_ABI optional<__empty> __omp_parallel_for_simd(
+    _Tp* __first1,
+    _DifferenceType __n,
+    _Up* __first2,
+    _Function __f) noexcept {
+  __omp_gpu_backend::__omp_map_alloc(__first2, __n);
+  __omp_gpu_backend::__omp_map_to(__first1, __n);
+#  pragma omp target teams distribute parallel for simd
+  for (_DifferenceType __i = 0; __i < __n; ++__i)
+    *(__first2 + __i) = __f(*(__first1 + __i));
+  __omp_gpu_backend::__omp_map_from(__first2, __n);
+  __omp_gpu_backend::__omp_map_free(__first1, __n);
+  return __empty{};
+}
+
+// Extracting the underlying pointer
+
+template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Function>
+_LIBCPP_HIDE_FROM_ABI optional<__empty>
+__parallel_for_simd(_Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Function __f) noexcept {
+  return __omp_gpu_backend::__omp_parallel_for_simd(
+      __omp_gpu_backend::__omp_extract_base_ptr(__first1),
+      __n,
+      __omp_gpu_backend::__omp_extract_base_ptr(__first2),
+      __f);
+}
+
+//===----------------------------------------------------------------------===//
+// Templates for three iterator
+//===----------------------------------------------------------------------===//
+
+template <class _Tp, class _DifferenceType, class _Up, class _Vp, class _Function>
+_LIBCPP_HIDE_FROM_ABI optional<__empty> __omp_parallel_for_simd(
+    _Tp* __first1,
+    _DifferenceType __n,
+    _Up* __first2,
+    _Vp* __first3,
+    _Function __f) noexcept {
+  __omp_gpu_backend::__omp_map_to(__first1, __n);
+  __omp_gpu_backend::__omp_map_to(__first2, __n);
+  __omp_gpu_backend::__omp_map_alloc(__first3, __n);
+#  pragma omp target teams distribute parallel for simd
+  for (_DifferenceType __i = 0; __i < __n; ++__i)
+    *(__first3 + __i) = __f(*(__first1 + __i), *(__first2 + __i));
+  __omp_gpu_backend::__omp_map_free(__first1, __n);
+  __omp_gpu_backend::__omp_map_free(__first2, __n);
+  __omp_gpu_backend::__omp_map_from(__first3, __n);
+  return __empty{};
+}
+
+// Extracting the underlying pointer
+
+template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Iterator3, class _Function>
+_LIBCPP_HIDE_FROM_ABI optional<__empty> __parallel_for_simd(
+    _Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Iterator3 __first3, _Function __f) noexcept {
+  return __omp_gpu_backend::__omp_parallel_for_simd(
+      __omp_gpu_backend::__omp_extract_base_ptr(__first1),
+      __n,
+      __omp_gpu_backend::__omp_extract_base_ptr(__first2),
+      __omp_gpu_backend::__omp_extract_base_ptr(__first3),
+      __f);
+}
+
+}
+}
+
 template <class _ExecutionPolicy, class _ForwardIterator, class _ForwardOutIterator, class _UnaryOperation>
-_LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
+_LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator> __pstl_transform(
     __omp_backend_tag,
     _ForwardIterator __first,
     _ForwardIterator __last,
     _ForwardOutIterator __result,
     _UnaryOperation __op) {
-  // The interface for the function switched between C++17 and C++20
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
-                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
+                __is_parallel_execution_policy_v<_ExecutionPolicy> &&
+                __libcpp_is_contiguous_iterator<_ForwardIterator>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
-    std::__par_backend::__parallel_for_simd_2(__first, __last - __first, __result, __op);
+    std::__par_backend::__parallel_for_simd(__first, __last - __first, __result, __op);
     return __result + (__last - __first);
   }
   // If it is not safe to offload to the GPU, we rely on the CPU backend.
@@ -48,19 +123,19 @@ template <class _ExecutionPolicy,
           class _ForwardOutIterator,
           class _BinaryOperation,
           enable_if_t<is_execution_policy_v<__remove_cvref_t<_ExecutionPolicy>>, int> = 0>
-_LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
+_LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator> __pstl_transform(
     __omp_backend_tag,
     _ForwardIterator1 __first1,
     _ForwardIterator1 __last1,
     _ForwardIterator2 __first2,
     _ForwardOutIterator __result,
     _BinaryOperation __op) {
-  // The interface for the function switched between C++17 and C++20
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
-                __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
-                __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value &&
+                __is_parallel_execution_policy_v<_ExecutionPolicy> &&
+                __libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
+                __libcpp_is_contiguous_iterator<_ForwardIterator2>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
-    std::__par_backend::__parallel_for_simd_3(__first1, __last1 - __first1, __first2, __result, __op);
+    std::__par_backend::__parallel_for_simd(__first1, __last1 - __first1, __first2, __result, __op);
     return __result + (__last1 - __first1);
   }
   // If it is not safe to offload to the GPU, we rely on the CPU backend.
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
index eb0a7ba52d8fdcc..ddc3ba03f8e9b08 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
@@ -18,6 +18,7 @@
 #include <__type_traits/is_arithmetic.h>
 #include <__type_traits/is_execution_policy.h>
 #include <__type_traits/operation_traits.h>
+#include <optional>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -56,7 +57,7 @@ template <class _ExecutionPolicy,
           class _Tp,
           class _BinaryOperation1,
           class _BinaryOperation2>
-_LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
+_LIBCPP_HIDE_FROM_ABI optional<_Tp> __pstl_transform_reduce(
     __omp_backend_tag,
     _ForwardIterator1 __first1,
     _ForwardIterator1 __last1,
@@ -66,8 +67,10 @@ _LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
     _BinaryOperation2 __transform) {
   // The interface for the function switched between C++17 and C++20
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
-                __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
-                __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value && is_arithmetic_v<_Tp> &&
+                __is_parallel_execution_policy_v<_ExecutionPolicy> &&
+                __libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
+                __libcpp_is_contiguous_iterator<_ForwardIterator2>::value && 
+                is_arithmetic_v<_Tp> &&
                 __is_supported_reduction<_BinaryOperation1, _Tp, _Tp>::value) {
     return std::__par_backend::__parallel_for_simd_reduction_2(
         __first1, __first2, __last1 - __first1, __init, __reduce, __transform);
@@ -81,16 +84,17 @@ _LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
 //===----------------------------------------------------------------------===//
 
 template <class _ExecutionPolicy, class _ForwardIterator, class _Tp, class _BinaryOperation, class _UnaryOperation>
-_LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
+_LIBCPP_HIDE_FROM_ABI optional<_Tp> __pstl_transform_reduce(
     __omp_backend_tag,
     _ForwardIterator __first,
     _ForwardIterator __last,
     _Tp __init,
     _BinaryOperation __reduce,
     _UnaryOperation __transform) {
-  // The interface for the function switched between C++17 and C++20
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
-                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value && is_arithmetic_v<_Tp> &&
+                __is_parallel_execution_policy_v<_ExecutionPolicy> &&
+                __libcpp_is_contiguous_iterator<_ForwardIterator>::value && 
+                is_arithmetic_v<_Tp> &&
                 __is_supported_reduction<_BinaryOperation, _Tp, _Tp>::value) {
     return std::__par_backend::__parallel_for_simd_reduction_1(
         __first, __last - __first, __init, __reduce, __transform);

>From 3a983bda0f6805b956ae4844c62d45e4e636ca85 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Mon, 16 Oct 2023 08:06:44 -0700
Subject: [PATCH 16/46] Clang formatted files

---
 .../__algorithm/pstl_backends/openmp/fill.h   | 13 ++++----
 .../pstl_backends/openmp/find_if.h            | 19 ++++++-----
 .../pstl_backends/openmp/for_each.h           | 15 +++++----
 .../pstl_backends/openmp/omp_offload.h        | 32 +++++++++----------
 .../pstl_backends/openmp/transform.h          | 19 ++++-------
 .../pstl_backends/openmp/transform_reduce.h   |  6 ++--
 6 files changed, 50 insertions(+), 54 deletions(-)

diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/fill.h b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
index 250e514b4d526fd..029b1877e309c16 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
@@ -29,12 +29,12 @@ namespace __par_backend {
 inline namespace __omp_gpu_backend {
 
 template <class _Tp, class _DifferenceType, class _Up>
-_LIBCPP_HIDE_FROM_ABI optional<__empty> __omp_parallel_for_simd_val(
-    _Tp* __first, _DifferenceType __n, const _Up& __value) noexcept {
+_LIBCPP_HIDE_FROM_ABI optional<__empty>
+__omp_parallel_for_simd_val(_Tp* __first, _DifferenceType __n, const _Up& __value) noexcept {
   __omp_gpu_backend::__omp_map_alloc(__first, __n);
 #  pragma omp target teams distribute parallel for simd firstprivate(__value)
   for (_DifferenceType __i = 0; __i < __n; ++__i)
-    *(__first+__i) = __value;
+    *(__first + __i) = __value;
   __omp_gpu_backend::__omp_map_from(__first, __n);
   return __empty{};
 }
@@ -42,11 +42,12 @@ _LIBCPP_HIDE_FROM_ABI optional<__empty> __omp_parallel_for_simd_val(
 template <class _ForwardIterator, class _DifferenceType, class _Tp>
 _LIBCPP_HIDE_FROM_ABI optional<__empty>
 __parallel_for_simd_val(_ForwardIterator __first, _DifferenceType __n, const _Tp& __value) noexcept {
-  return __omp_gpu_backend::__omp_parallel_for_simd_val(__omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __value);
+  return __omp_gpu_backend::__omp_parallel_for_simd_val(
+      __omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __value);
 }
 
-}
-}
+} // namespace __omp_gpu_backend
+} // namespace __par_backend
 
 template <class _ExecutionPolicy, class _ForwardIterator, class _Tp>
 _LIBCPP_HIDE_FROM_ABI optional<__empty>
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
index 94f7c42953c2a4a..19b581a5f72b00a 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
@@ -27,12 +27,13 @@ namespace __par_backend {
 inline namespace __omp_gpu_backend {
 
 template <class _Tp, class _DifferenceType, class _Predicate>
-_LIBCPP_HIDE_FROM_ABI _DifferenceType  __omp_parallel_for_min_idx(_Tp* __first, _DifferenceType __n, _Predicate __pred) noexcept {
+_LIBCPP_HIDE_FROM_ABI _DifferenceType
+__omp_parallel_for_min_idx(_Tp* __first, _DifferenceType __n, _Predicate __pred) noexcept {
   __omp_gpu_backend::__omp_map_to(__first, __n);
   _DifferenceType idx = __n;
-#  pragma omp target teams distribute parallel for simd reduction(min:idx)
-  for (_DifferenceType __i = 0; __i < __n; ++__i){
-    if (__pred(*(__first+__i))) {
+#  pragma omp target teams distribute parallel for simd reduction(min : idx)
+  for (_DifferenceType __i = 0; __i < __n; ++__i) {
+    if (__pred(*(__first + __i))) {
       idx = (__i < idx) ? __i : idx;
     }
   }
@@ -43,12 +44,14 @@ _LIBCPP_HIDE_FROM_ABI _DifferenceType  __omp_parallel_for_min_idx(_Tp* __first,
 // Extracting the underlying pointer
 
 template <class _ForwardIterator, class _DifferenceType, class _Predicate>
-_LIBCPP_HIDE_FROM_ABI optional<_ForwardIterator> __parallel_for_min_idx(_ForwardIterator __first, _DifferenceType __n, _Predicate __pred) noexcept {
-  return __first + __omp_gpu_backend::__omp_parallel_for_min_idx(__omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __pred);
+_LIBCPP_HIDE_FROM_ABI optional<_ForwardIterator>
+__parallel_for_min_idx(_ForwardIterator __first, _DifferenceType __n, _Predicate __pred) noexcept {
+  return __first +
+         __omp_gpu_backend::__omp_parallel_for_min_idx(__omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __pred);
 }
 
-}
-}
+} // namespace __omp_gpu_backend
+} // namespace __par_backend
 
 template <class _ExecutionPolicy, class _ForwardIterator, class _Predicate>
 _LIBCPP_HIDE_FROM_ABI optional<_ForwardIterator>
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
index 444352b4a5e51a8..eea337179b6576c 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
@@ -14,8 +14,8 @@
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
 #include <__type_traits/is_execution_policy.h>
-#include <optional>
 #include <__utility/empty.h>
+#include <optional>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -29,12 +29,12 @@ namespace __par_backend {
 inline namespace __omp_gpu_backend {
 
 template <class _Tp, class _DifferenceType, class _Function>
-_LIBCPP_HIDE_FROM_ABI optional<__empty> __omp_parallel_for_simd(
-    _Tp* __first, _DifferenceType __n, _Function __f) noexcept {
+_LIBCPP_HIDE_FROM_ABI optional<__empty>
+__omp_parallel_for_simd(_Tp* __first, _DifferenceType __n, _Function __f) noexcept {
   __omp_gpu_backend::__omp_map_to(__first, __n);
 #  pragma omp target teams distribute parallel for simd
   for (_DifferenceType __i = 0; __i < __n; ++__i)
-    __f(*(__first+__i));
+    __f(*(__first + __i));
   __omp_gpu_backend::__omp_map_from(__first, __n);
   return __empty{};
 }
@@ -42,12 +42,13 @@ _LIBCPP_HIDE_FROM_ABI optional<__empty> __omp_parallel_for_simd(
 // Extracting the underlying pointer
 
 template <class _ForwardIterator, class _DifferenceType, class _Function>
-_LIBCPP_HIDE_FROM_ABI optional<__empty> __parallel_for_simd_1(_ForwardIterator __first, _DifferenceType __n, _Function __f) noexcept {
+_LIBCPP_HIDE_FROM_ABI optional<__empty>
+__parallel_for_simd_1(_ForwardIterator __first, _DifferenceType __n, _Function __f) noexcept {
   return __omp_gpu_backend::__omp_parallel_for_simd(__omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __f);
 }
 
-}
-}
+} // namespace __omp_gpu_backend
+} // namespace __par_backend
 
 template <class _ExecutionPolicy, class _ForwardIterator, class _Functor>
 _LIBCPP_HIDE_FROM_ABI optional<__empty>
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h b/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
index 77afb35ee71b4bb..183fdfa5cfa02a5 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
@@ -9,8 +9,8 @@
 #ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_OMP_OFFLOAD_H
 #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_OMP_OFFLOAD_H
 
-#include <__assert>
 #include <__algorithm/unwrap_iter.h>
+#include <__assert>
 #include <__config>
 #include <__functional/operations.h>
 #include <__iterator/iterator_traits.h>
@@ -19,8 +19,8 @@
 #include <__memory/pointer_traits.h>
 #include <__type_traits/is_pointer.h>
 #include <__type_traits/is_same.h>
-#include <__utility/move.h>
 #include <__utility/empty.h>
+#include <__utility/move.h>
 #include <cstddef>
 #include <optional>
 
@@ -70,28 +70,28 @@ template <class _Iterator, class _DifferenceType>
 _LIBCPP_HIDE_FROM_ABI void
 __omp_map_to([[maybe_unused]] const _Iterator p, [[maybe_unused]] const _DifferenceType len) noexcept {
   static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
-#pragma omp target enter data map(to : p[0 : len])
+#  pragma omp target enter data map(to : p[0 : len])
 }
 
 template <class _Iterator, class _DifferenceType>
 _LIBCPP_HIDE_FROM_ABI void
 __omp_map_from([[maybe_unused]] const _Iterator p, [[maybe_unused]] const _DifferenceType len) noexcept {
   static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
-#pragma omp target exit data map(from : p[0 : len])
+#  pragma omp target exit data map(from : p[0 : len])
 }
 
 template <class _Iterator, class _DifferenceType>
 _LIBCPP_HIDE_FROM_ABI void
 __omp_map_alloc([[maybe_unused]] const _Iterator p, [[maybe_unused]] const _DifferenceType len) noexcept {
   static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
-#pragma omp target enter data map(alloc : p[0 : len])
+#  pragma omp target enter data map(alloc : p[0 : len])
 }
 
 template <class _Iterator, class _DifferenceType>
 _LIBCPP_HIDE_FROM_ABI void
 __omp_map_free([[maybe_unused]] const _Iterator p, [[maybe_unused]] const _DifferenceType len) noexcept {
   static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
-#pragma omp target exit data map(release : p[0 : len])
+#  pragma omp target exit data map(release : p[0 : len])
 }
 
 //===----------------------------------------------------------------------===//
@@ -112,12 +112,12 @@ __omp_map_free([[maybe_unused]] const _Iterator p, [[maybe_unused]] const _Diffe
         _DifferenceType __n,                                                                                           \
         _Tp __init,                                                                                                    \
         std_op<_BinaryOperationType> __reduce,                                                                         \
-        _UnaryOperation __transform) noexcept {                                                                         \
-      __omp_gpu_backend::__omp_map_to(__first, __n);                                                                                      \
-_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init))                                    \
+        _UnaryOperation __transform) noexcept {                                                                        \
+      __omp_gpu_backend::__omp_map_to(__first, __n);                                                                   \
+_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init))                                   \
       for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                  \
         __init = __reduce(__init, __transform(*(__first + __i)));                                                      \
-      __omp_gpu_backend::__omp_map_free(__first, __n);                                                                                    \
+      __omp_gpu_backend::__omp_map_free(__first, __n);                                                                 \
       return __init;                                                                                                   \
     }
 
@@ -134,14 +134,14 @@ _PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__in
         _DifferenceType __n,                                                                                           \
         _Tp __init,                                                                                                    \
         std_op<_BinaryOperationType> __reduce,                                                                         \
-        _UnaryOperation __transform) noexcept {                                                                         \
-      __omp_gpu_backend::__omp_map_to(__first1, __n);                                                                                     \
-      __omp_gpu_backend::__omp_map_to(__first2, __n);                                                                                     \
-_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init))                                    \
+        _UnaryOperation __transform) noexcept {                                                                        \
+      __omp_gpu_backend::__omp_map_to(__first1, __n);                                                                  \
+      __omp_gpu_backend::__omp_map_to(__first2, __n);                                                                  \
+_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init))                                   \
       for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                  \
         __init = __reduce(__init, __transform(*(__first1 + __i), *(__first2 + __i)));                                  \
-      __omp_gpu_backend::__omp_map_free(__first1, __n);                                                                                   \
-      __omp_gpu_backend::__omp_map_free(__first2, __n);                                                                                   \
+      __omp_gpu_backend::__omp_map_free(__first1, __n);                                                                \
+      __omp_gpu_backend::__omp_map_free(__first2, __n);                                                                \
       return __init;                                                                                                   \
     } // namespace __omp_gpu_backend
 
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
index b29a479c17887c7..ba218dddd4be563 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
@@ -33,11 +33,8 @@ inline namespace __omp_gpu_backend {
 //===----------------------------------------------------------------------===//
 
 template <class _Tp, class _DifferenceType, class _Up, class _Function>
-_LIBCPP_HIDE_FROM_ABI optional<__empty> __omp_parallel_for_simd(
-    _Tp* __first1,
-    _DifferenceType __n,
-    _Up* __first2,
-    _Function __f) noexcept {
+_LIBCPP_HIDE_FROM_ABI optional<__empty>
+__omp_parallel_for_simd(_Tp* __first1, _DifferenceType __n, _Up* __first2, _Function __f) noexcept {
   __omp_gpu_backend::__omp_map_alloc(__first2, __n);
   __omp_gpu_backend::__omp_map_to(__first1, __n);
 #  pragma omp target teams distribute parallel for simd
@@ -65,12 +62,8 @@ __parallel_for_simd(_Iterator1 __first1, _DifferenceType __n, _Iterator2 __first
 //===----------------------------------------------------------------------===//
 
 template <class _Tp, class _DifferenceType, class _Up, class _Vp, class _Function>
-_LIBCPP_HIDE_FROM_ABI optional<__empty> __omp_parallel_for_simd(
-    _Tp* __first1,
-    _DifferenceType __n,
-    _Up* __first2,
-    _Vp* __first3,
-    _Function __f) noexcept {
+_LIBCPP_HIDE_FROM_ABI optional<__empty>
+__omp_parallel_for_simd(_Tp* __first1, _DifferenceType __n, _Up* __first2, _Vp* __first3, _Function __f) noexcept {
   __omp_gpu_backend::__omp_map_to(__first1, __n);
   __omp_gpu_backend::__omp_map_to(__first2, __n);
   __omp_gpu_backend::__omp_map_alloc(__first3, __n);
@@ -96,8 +89,8 @@ _LIBCPP_HIDE_FROM_ABI optional<__empty> __parallel_for_simd(
       __f);
 }
 
-}
-}
+} // namespace __omp_gpu_backend
+} // namespace __par_backend
 
 template <class _ExecutionPolicy, class _ForwardIterator, class _ForwardOutIterator, class _UnaryOperation>
 _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator> __pstl_transform(
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
index ddc3ba03f8e9b08..1427184d04d0c83 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
@@ -69,8 +69,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_Tp> __pstl_transform_reduce(
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __is_parallel_execution_policy_v<_ExecutionPolicy> &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
-                __libcpp_is_contiguous_iterator<_ForwardIterator2>::value && 
-                is_arithmetic_v<_Tp> &&
+                __libcpp_is_contiguous_iterator<_ForwardIterator2>::value && is_arithmetic_v<_Tp> &&
                 __is_supported_reduction<_BinaryOperation1, _Tp, _Tp>::value) {
     return std::__par_backend::__parallel_for_simd_reduction_2(
         __first1, __first2, __last1 - __first1, __init, __reduce, __transform);
@@ -93,8 +92,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_Tp> __pstl_transform_reduce(
     _UnaryOperation __transform) {
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __is_parallel_execution_policy_v<_ExecutionPolicy> &&
-                __libcpp_is_contiguous_iterator<_ForwardIterator>::value && 
-                is_arithmetic_v<_Tp> &&
+                __libcpp_is_contiguous_iterator<_ForwardIterator>::value && is_arithmetic_v<_Tp> &&
                 __is_supported_reduction<_BinaryOperation, _Tp, _Tp>::value) {
     return std::__par_backend::__parallel_for_simd_reduction_1(
         __first, __last - __first, __init, __reduce, __transform);

>From 49eee187e7b4ae8dd8a73e00018f0be8fd5c76eb Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Thu, 19 Oct 2023 11:35:54 -0700
Subject: [PATCH 17/46] Fixing issued discussed during code review on the 16th
 of October

- Updated CMake logic
- Made `fill` call `transform` with a lambda returning a constant.
- Used `__rewrap_iter` and `__unwrap_iter`.
---
 libcxx/CMakeLists.txt                         |  13 +-
 .../pstl_backends/openmp/backend.h            |   8 +
 .../__algorithm/pstl_backends/openmp/fill.h   |  31 +---
 .../pstl_backends/openmp/find_if.h            |  27 +---
 .../pstl_backends/openmp/for_each.h           |  29 +---
 .../pstl_backends/openmp/omp_offload.h        | 144 ++----------------
 .../pstl_backends/openmp/transform.h          |  67 +++-----
 .../pstl_backends/openmp/transform_reduce.h   | 130 ++++++++++++----
 8 files changed, 163 insertions(+), 286 deletions(-)

diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt
index 24dd562035d24ed..0fdeab59e89f767 100644
--- a/libcxx/CMakeLists.txt
+++ b/libcxx/CMakeLists.txt
@@ -295,15 +295,14 @@ option(LIBCXX_HAS_EXTERNAL_THREAD_API
   "Build libc++ with an externalized threading API.
    This option may only be set to ON when LIBCXX_ENABLE_THREADS=ON." OFF)
 
-set(LIBCXX_PSTL_BACKEND "openmp" CACHE INTERNAL "Which PSTL backend to use")
-if (LIBCXX_PSTL_BACKEND STREQUAL "")
-  if (LIBCXX_ENABLE_THREADS)
-    set(LIBCXX_PSTL_BACKEND "std-thread")
-  else()
-    set(LIBCXX_PSTL_BACKEND "serial")
-  endif()
+if (LIBCXX_ENABLE_THREADS)
+  set(LIBCXX_PSTL_BACKEND_DEFAULT "std-thread")
+else()
+  set(LIBCXX_PSTL_BACKEND_DEFAULT "serial")
 endif()
 
+set(LIBCXX_PSTL_BACKEND "${LIBCXX_PSTL_BACKEND_DEFAULT}" CACHE STRING "Select the PSTL backend to use. Valid values are serial, std-thread, libdispatch, openmp. Default: ${LIBCXX_PSTL_BACKEND_DEFAULT}")
+
 # Misc options ----------------------------------------------------------------
 # FIXME: Turn -pedantic back ON. It is currently off because it warns
 # about #include_next which is used everywhere.
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/backend.h b/libcxx/include/__algorithm/pstl_backends/openmp/backend.h
index e4e6136082a3427..9396f91b1a755f2 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/backend.h
@@ -20,6 +20,14 @@
 
 #if _LIBCPP_STD_VER >= 17
 
+#if defined(_LIBCPP_PSTL_BACKEND_OPENMP)
+# if !defined(_OPENMP)
+//#   warning "PSTL is configured to use the OpenMP backend, but OpenMP is not enabled. Did you compile with -fopenmp?"
+# elif (defined(_OPENMP) && _OPENMP < 201511)
+//#   warning "OpenMP target offloading has been supported since OpenMP version 4.5 (201511). Please use a more recent version of OpenMP."
+# endif
+#endif
+
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 struct __omp_backend_tag {};
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/fill.h b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
index 029b1877e309c16..e8101233e342a3e 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
@@ -13,7 +13,9 @@
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
+#include <__iterator/wrap_iter.h>
 #include <__type_traits/is_execution_policy.h>
+#include <__type_traits/remove_pointer.h>
 #include <__utility/empty.h>
 #include <optional>
 
@@ -25,30 +27,6 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
-namespace __par_backend {
-inline namespace __omp_gpu_backend {
-
-template <class _Tp, class _DifferenceType, class _Up>
-_LIBCPP_HIDE_FROM_ABI optional<__empty>
-__omp_parallel_for_simd_val(_Tp* __first, _DifferenceType __n, const _Up& __value) noexcept {
-  __omp_gpu_backend::__omp_map_alloc(__first, __n);
-#  pragma omp target teams distribute parallel for simd firstprivate(__value)
-  for (_DifferenceType __i = 0; __i < __n; ++__i)
-    *(__first + __i) = __value;
-  __omp_gpu_backend::__omp_map_from(__first, __n);
-  return __empty{};
-}
-
-template <class _ForwardIterator, class _DifferenceType, class _Tp>
-_LIBCPP_HIDE_FROM_ABI optional<__empty>
-__parallel_for_simd_val(_ForwardIterator __first, _DifferenceType __n, const _Tp& __value) noexcept {
-  return __omp_gpu_backend::__omp_parallel_for_simd_val(
-      __omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __value);
-}
-
-} // namespace __omp_gpu_backend
-} // namespace __par_backend
-
 template <class _ExecutionPolicy, class _ForwardIterator, class _Tp>
 _LIBCPP_HIDE_FROM_ABI optional<__empty>
 __pstl_fill(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) {
@@ -58,7 +36,10 @@ __pstl_fill(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __last
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __is_parallel_execution_policy_v<_ExecutionPolicy> &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
-    return std::__par_backend::__parallel_for_simd_val(__first, __last - __first, __value);
+    std::__rewrap_iter(__first, __par_backend::__omp_for_each(std::__unwrap_iter(__first), __last - __first, [&](std::remove_pointer_t<decltype(std::__unwrap_iter(__first))>& e) {
+          e = __value;
+        }));
+    return __empty{};
   }
   // Otherwise, we execute fill on the CPU instead
   else {
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
index 19b581a5f72b00a..f07965885c9c68c 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
@@ -13,6 +13,7 @@
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
+#include <__iterator/wrap_iter.h>
 #include <optional>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -23,13 +24,9 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
-namespace __par_backend {
-inline namespace __omp_gpu_backend {
-
 template <class _Tp, class _DifferenceType, class _Predicate>
-_LIBCPP_HIDE_FROM_ABI _DifferenceType
-__omp_parallel_for_min_idx(_Tp* __first, _DifferenceType __n, _Predicate __pred) noexcept {
-  __omp_gpu_backend::__omp_map_to(__first, __n);
+_LIBCPP_HIDE_FROM_ABI _Tp* __omp_find_if(_Tp* __first, _DifferenceType __n, _Predicate __pred) noexcept {
+  __par_backend::__omp_map_to(__first, __n);
   _DifferenceType idx = __n;
 #  pragma omp target teams distribute parallel for simd reduction(min : idx)
   for (_DifferenceType __i = 0; __i < __n; ++__i) {
@@ -37,29 +34,17 @@ __omp_parallel_for_min_idx(_Tp* __first, _DifferenceType __n, _Predicate __pred)
       idx = (__i < idx) ? __i : idx;
     }
   }
-  __omp_gpu_backend::__omp_map_free(__first, __n);
-  return idx;
-}
-
-// Extracting the underlying pointer
-
-template <class _ForwardIterator, class _DifferenceType, class _Predicate>
-_LIBCPP_HIDE_FROM_ABI optional<_ForwardIterator>
-__parallel_for_min_idx(_ForwardIterator __first, _DifferenceType __n, _Predicate __pred) noexcept {
-  return __first +
-         __omp_gpu_backend::__omp_parallel_for_min_idx(__omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __pred);
+  __par_backend::__omp_map_free(__first, __n);
+  return __first + idx;
 }
 
-} // namespace __omp_gpu_backend
-} // namespace __par_backend
-
 template <class _ExecutionPolicy, class _ForwardIterator, class _Predicate>
 _LIBCPP_HIDE_FROM_ABI optional<_ForwardIterator>
 __pstl_find_if(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) {
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __is_parallel_execution_policy_v<_ExecutionPolicy> &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
-    return __par_backend::__parallel_for_min_idx(__first, __last - __first, __pred);
+    return std::__rewrap_iter(__first, std::__omp_find_if(std::__unwrap_iter(__first), __last - __first, __pred));
   } else {
     return std::__pstl_find_if<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __pred);
   }
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
index eea337179b6576c..eb8caf23971cb33 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
@@ -13,6 +13,7 @@
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
+#include <__iterator/wrap_iter.h>
 #include <__type_traits/is_execution_policy.h>
 #include <__utility/empty.h>
 #include <optional>
@@ -25,31 +26,6 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
-namespace __par_backend {
-inline namespace __omp_gpu_backend {
-
-template <class _Tp, class _DifferenceType, class _Function>
-_LIBCPP_HIDE_FROM_ABI optional<__empty>
-__omp_parallel_for_simd(_Tp* __first, _DifferenceType __n, _Function __f) noexcept {
-  __omp_gpu_backend::__omp_map_to(__first, __n);
-#  pragma omp target teams distribute parallel for simd
-  for (_DifferenceType __i = 0; __i < __n; ++__i)
-    __f(*(__first + __i));
-  __omp_gpu_backend::__omp_map_from(__first, __n);
-  return __empty{};
-}
-
-// Extracting the underlying pointer
-
-template <class _ForwardIterator, class _DifferenceType, class _Function>
-_LIBCPP_HIDE_FROM_ABI optional<__empty>
-__parallel_for_simd_1(_ForwardIterator __first, _DifferenceType __n, _Function __f) noexcept {
-  return __omp_gpu_backend::__omp_parallel_for_simd(__omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __f);
-}
-
-} // namespace __omp_gpu_backend
-} // namespace __par_backend
-
 template <class _ExecutionPolicy, class _ForwardIterator, class _Functor>
 _LIBCPP_HIDE_FROM_ABI optional<__empty>
 __pstl_for_each(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Functor __func) {
@@ -59,7 +35,8 @@ __pstl_for_each(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
-    return std::__par_backend::__parallel_for_simd_1(__first, __last - __first, __func);
+    __par_backend::__omp_for_each(std::__unwrap_iter(__first), __last - __first, __func);
+    return __empty{};
   }
   // Else we fall back to the serial backend
   else {
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h b/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
index 183fdfa5cfa02a5..2e44af3c9a761f5 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
@@ -22,7 +22,6 @@
 #include <__utility/empty.h>
 #include <__utility/move.h>
 #include <cstddef>
-#include <optional>
 
 // is_same
 
@@ -43,27 +42,9 @@ namespace __par_backend {
 inline namespace __omp_gpu_backend {
 
 //===----------------------------------------------------------------------===//
-// Functions for eaxtracting the pase pointers
-//===----------------------------------------------------------------------===//
-
-// In the general case we do not need to extract it. This is for instance the
-// case for pointers.
-template <typename _Tp>
-_LIBCPP_HIDE_FROM_ABI inline auto __omp_extract_base_ptr(_Tp p) noexcept {
-  return std::__unwrap_iter(p);
-}
-
-// For vectors and arrays, etc, we need to extract the underlying base pointer.
-template <typename _Tp>
-_LIBCPP_HIDE_FROM_ABI inline _Tp __omp_extract_base_ptr(std::__wrap_iter<_Tp> w) noexcept {
-  std::pointer_traits<std::__wrap_iter<_Tp>> PT;
-  return PT.to_address(w);
-}
-
-//===----------------------------------------------------------------------===//
-// The following four functions differentiates between contiguous iterators and
-// non-contiguous iterators. That allows to use the same implementations for
-// reference and value iterators
+// The following four functions can be used to map contiguous array sections to
+// and from the device. For now, they are simple overlays of the OpenMP pragmas,
+// but they should be updated wen adding support for other iterator types.
 //===----------------------------------------------------------------------===//
 
 template <class _Iterator, class _DifferenceType>
@@ -95,117 +76,18 @@ __omp_map_free([[maybe_unused]] const _Iterator p, [[maybe_unused]] const _Diffe
 }
 
 //===----------------------------------------------------------------------===//
-// Templates for reductions
+// The OpenMP implementation of for_each is shared between for_each and fill
 //===----------------------------------------------------------------------===//
 
-// In the two following function templates, we map the pointer to the device in
-// different ways depending on if they are contiguou or not.
-
-#  define __PSTL_OMP_SIMD_1_REDUCTION(omp_op, std_op)                                                                  \
-    template <class _Iterator,                                                                                         \
-              class _DifferenceType,                                                                                   \
-              typename _Tp,                                                                                            \
-              typename _BinaryOperationType,                                                                           \
-              typename _UnaryOperation>                                                                                \
-    _LIBCPP_HIDE_FROM_ABI _Tp __omp_parallel_for_simd_reduction_1(                                                     \
-        _Iterator __first,                                                                                             \
-        _DifferenceType __n,                                                                                           \
-        _Tp __init,                                                                                                    \
-        std_op<_BinaryOperationType> __reduce,                                                                         \
-        _UnaryOperation __transform) noexcept {                                                                        \
-      __omp_gpu_backend::__omp_map_to(__first, __n);                                                                   \
-_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init))                                   \
-      for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                  \
-        __init = __reduce(__init, __transform(*(__first + __i)));                                                      \
-      __omp_gpu_backend::__omp_map_free(__first, __n);                                                                 \
-      return __init;                                                                                                   \
-    }
-
-#  define __PSTL_OMP_SIMD_2_REDUCTION(omp_op, std_op)                                                                  \
-    template <class _Iterator1,                                                                                        \
-              class _Iterator2,                                                                                        \
-              class _DifferenceType,                                                                                   \
-              typename _Tp,                                                                                            \
-              typename _BinaryOperationType,                                                                           \
-              typename _UnaryOperation >                                                                               \
-    _LIBCPP_HIDE_FROM_ABI _Tp __omp_parallel_for_simd_reduction_2(                                                     \
-        _Iterator1 __first1,                                                                                           \
-        _Iterator2 __first2,                                                                                           \
-        _DifferenceType __n,                                                                                           \
-        _Tp __init,                                                                                                    \
-        std_op<_BinaryOperationType> __reduce,                                                                         \
-        _UnaryOperation __transform) noexcept {                                                                        \
-      __omp_gpu_backend::__omp_map_to(__first1, __n);                                                                  \
-      __omp_gpu_backend::__omp_map_to(__first2, __n);                                                                  \
-_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init))                                   \
-      for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                  \
-        __init = __reduce(__init, __transform(*(__first1 + __i), *(__first2 + __i)));                                  \
-      __omp_gpu_backend::__omp_map_free(__first1, __n);                                                                \
-      __omp_gpu_backend::__omp_map_free(__first2, __n);                                                                \
-      return __init;                                                                                                   \
-    } // namespace __omp_gpu_backend
-
-#  define __PSTL_OMP_SIMD_REDUCTION(omp_op, std_op)                                                                    \
-    __PSTL_OMP_SIMD_1_REDUCTION(omp_op, std_op)                                                                        \
-    __PSTL_OMP_SIMD_2_REDUCTION(omp_op, std_op)
-
-// Addition
-__PSTL_OMP_SIMD_REDUCTION(+, std::plus)
-
-// Subtraction
-__PSTL_OMP_SIMD_REDUCTION(-, std::minus)
-
-// Multiplication
-__PSTL_OMP_SIMD_REDUCTION(*, std::multiplies)
-
-// Logical and
-__PSTL_OMP_SIMD_REDUCTION(&&, std::logical_and)
-
-// Logical or
-__PSTL_OMP_SIMD_REDUCTION(||, std::logical_or)
-
-// Bitwise and
-__PSTL_OMP_SIMD_REDUCTION(&, std::bit_and)
-
-// Bitwise or
-__PSTL_OMP_SIMD_REDUCTION(|, std::bit_or)
-
-// Bitwise xor
-__PSTL_OMP_SIMD_REDUCTION(^, std::bit_xor)
-
-// Extracting the underlying pointers
-
-template <class _Iterator, class _DifferenceType, typename _Tp, typename _BinaryOperation, typename _UnaryOperation >
-_LIBCPP_HIDE_FROM_ABI _Tp __parallel_for_simd_reduction_1(
-    _Iterator __first,
-    _DifferenceType __n,
-    _Tp __init,
-    _BinaryOperation __reduce,
-    _UnaryOperation __transform) noexcept {
-  return __omp_gpu_backend::__omp_parallel_for_simd_reduction_1(
-      __omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __init, __reduce, __transform);
-}
-
-template <class _Iterator1,
-          class _Iterator2,
-          class _DifferenceType,
-          typename _Tp,
-          typename _BinaryOperation,
-          typename _UnaryOperation >
-_LIBCPP_HIDE_FROM_ABI _Tp __parallel_for_simd_reduction_2(
-    _Iterator1 __first1,
-    _Iterator2 __first2,
-    _DifferenceType __n,
-    _Tp __init,
-    _BinaryOperation __reduce,
-    _UnaryOperation __transform) noexcept {
-  return __omp_gpu_backend::__omp_parallel_for_simd_reduction_2(
-      __omp_gpu_backend::__omp_extract_base_ptr(__first1),
-      __omp_gpu_backend::__omp_extract_base_ptr(__first2),
-      __n,
-      __init,
-      __reduce,
-      __transform);
+template <class _Tp, class _DifferenceType, class _Function>
+_LIBCPP_HIDE_FROM_ABI _Tp*
+__omp_for_each(_Tp* __first, _DifferenceType __n, _Function __f) noexcept {
+  __par_backend::__omp_map_to(__first, __n);
+#  pragma omp target teams distribute parallel for simd
+  for (_DifferenceType __i = 0; __i < __n; ++__i)
+    __f(*(__first + __i));
+  __par_backend::__omp_map_from(__first, __n);
+  return __first + __n;
 }
 
 } // namespace __omp_gpu_backend
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
index ba218dddd4be563..f7e52d7933b6c40 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
@@ -14,6 +14,7 @@
 #include <__algorithm/transform.h>
 #include <__config>
 #include <__iterator/concepts.h>
+#include <__iterator/wrap_iter.h>
 #include <__type_traits/is_execution_policy.h>
 #include <optional>
 
@@ -25,73 +26,39 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
-namespace __par_backend {
-inline namespace __omp_gpu_backend {
-
 //===----------------------------------------------------------------------===//
-// Templates for two iterators
+// OpenMP implementations of transform for one and two input iterators and one
+// output iterator
 //===----------------------------------------------------------------------===//
 
 template <class _Tp, class _DifferenceType, class _Up, class _Function>
 _LIBCPP_HIDE_FROM_ABI optional<__empty>
-__omp_parallel_for_simd(_Tp* __first1, _DifferenceType __n, _Up* __first2, _Function __f) noexcept {
-  __omp_gpu_backend::__omp_map_alloc(__first2, __n);
-  __omp_gpu_backend::__omp_map_to(__first1, __n);
+__omp_transform(_Tp* __first1, _DifferenceType __n, _Up* __first2, _Function __f) noexcept {
+  __par_backend::__omp_map_alloc(__first2, __n);
+  __par_backend::__omp_map_to(__first1, __n);
 #  pragma omp target teams distribute parallel for simd
   for (_DifferenceType __i = 0; __i < __n; ++__i)
     *(__first2 + __i) = __f(*(__first1 + __i));
-  __omp_gpu_backend::__omp_map_from(__first2, __n);
-  __omp_gpu_backend::__omp_map_free(__first1, __n);
+  __par_backend::__omp_map_from(__first2, __n);
+  __par_backend::__omp_map_free(__first1, __n);
   return __empty{};
 }
 
-// Extracting the underlying pointer
-
-template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Function>
-_LIBCPP_HIDE_FROM_ABI optional<__empty>
-__parallel_for_simd(_Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Function __f) noexcept {
-  return __omp_gpu_backend::__omp_parallel_for_simd(
-      __omp_gpu_backend::__omp_extract_base_ptr(__first1),
-      __n,
-      __omp_gpu_backend::__omp_extract_base_ptr(__first2),
-      __f);
-}
-
-//===----------------------------------------------------------------------===//
-// Templates for three iterator
-//===----------------------------------------------------------------------===//
-
 template <class _Tp, class _DifferenceType, class _Up, class _Vp, class _Function>
 _LIBCPP_HIDE_FROM_ABI optional<__empty>
-__omp_parallel_for_simd(_Tp* __first1, _DifferenceType __n, _Up* __first2, _Vp* __first3, _Function __f) noexcept {
-  __omp_gpu_backend::__omp_map_to(__first1, __n);
-  __omp_gpu_backend::__omp_map_to(__first2, __n);
-  __omp_gpu_backend::__omp_map_alloc(__first3, __n);
+__omp_transform(_Tp* __first1, _DifferenceType __n, _Up* __first2, _Vp* __first3, _Function __f) noexcept {
+  __par_backend::__omp_map_to(__first1, __n);
+  __par_backend::__omp_map_to(__first2, __n);
+  __par_backend::__omp_map_alloc(__first3, __n);
 #  pragma omp target teams distribute parallel for simd
   for (_DifferenceType __i = 0; __i < __n; ++__i)
     *(__first3 + __i) = __f(*(__first1 + __i), *(__first2 + __i));
-  __omp_gpu_backend::__omp_map_free(__first1, __n);
-  __omp_gpu_backend::__omp_map_free(__first2, __n);
-  __omp_gpu_backend::__omp_map_from(__first3, __n);
+  __par_backend::__omp_map_free(__first1, __n);
+  __par_backend::__omp_map_free(__first2, __n);
+  __par_backend::__omp_map_from(__first3, __n);
   return __empty{};
 }
 
-// Extracting the underlying pointer
-
-template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Iterator3, class _Function>
-_LIBCPP_HIDE_FROM_ABI optional<__empty> __parallel_for_simd(
-    _Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Iterator3 __first3, _Function __f) noexcept {
-  return __omp_gpu_backend::__omp_parallel_for_simd(
-      __omp_gpu_backend::__omp_extract_base_ptr(__first1),
-      __n,
-      __omp_gpu_backend::__omp_extract_base_ptr(__first2),
-      __omp_gpu_backend::__omp_extract_base_ptr(__first3),
-      __f);
-}
-
-} // namespace __omp_gpu_backend
-} // namespace __par_backend
-
 template <class _ExecutionPolicy, class _ForwardIterator, class _ForwardOutIterator, class _UnaryOperation>
 _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator> __pstl_transform(
     __omp_backend_tag,
@@ -103,7 +70,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator> __pstl_transform(
                 __is_parallel_execution_policy_v<_ExecutionPolicy> &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
-    std::__par_backend::__parallel_for_simd(__first, __last - __first, __result, __op);
+    std::__omp_transform(std::__unwrap_iter(__first), __last - __first, std::__unwrap_iter(__result), __op);
     return __result + (__last - __first);
   }
   // If it is not safe to offload to the GPU, we rely on the CPU backend.
@@ -128,7 +95,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator> __pstl_transform(
                 __libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator2>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
-    std::__par_backend::__parallel_for_simd(__first1, __last1 - __first1, __first2, __result, __op);
+    std::__omp_transform(std::__unwrap_iter(__first1), __last1 - __first1, std::__unwrap_iter(__first2), std::__unwrap_iter(__result), __op);
     return __result + (__last1 - __first1);
   }
   // If it is not safe to offload to the GPU, we rely on the CPU backend.
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
index 1427184d04d0c83..3798e1cb3f3fac1 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
@@ -13,6 +13,7 @@
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
 #include <__iterator/concepts.h>
+#include <__iterator/wrap_iter.h>
 #include <__numeric/transform_reduce.h>
 #include <__type_traits/integral_constant.h>
 #include <__type_traits/is_arithmetic.h>
@@ -28,6 +29,87 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
+//===----------------------------------------------------------------------===//
+// Templates for predefined reductions
+//===----------------------------------------------------------------------===//
+
+#  define __PSTL_OMP_SIMD_1_REDUCTION(omp_op, std_op)                                                                  \
+    template <class _Iterator,                                                                                         \
+              class _DifferenceType,                                                                                   \
+              typename _Tp,                                                                                            \
+              typename _BinaryOperationType,                                                                           \
+              typename _UnaryOperation>                                                                                \
+    _LIBCPP_HIDE_FROM_ABI _Tp __omp_transform_reduce(                                                     \
+        _Iterator __first,                                                                                             \
+        _DifferenceType __n,                                                                                           \
+        _Tp __init,                                                                                                    \
+        std_op<_BinaryOperationType> __reduce,                                                                         \
+        _UnaryOperation __transform) noexcept {                                                                        \
+      __par_backend::__omp_map_to(__first, __n);                                                                   \
+_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init))                                   \
+      for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                  \
+        __init = __reduce(__init, __transform(*(__first + __i)));                                                      \
+      __par_backend::__omp_map_free(__first, __n);                                                                 \
+      return __init;                                                                                                   \
+    }
+
+#  define __PSTL_OMP_SIMD_2_REDUCTION(omp_op, std_op)                                                                  \
+    template <class _Iterator1,                                                                                        \
+              class _Iterator2,                                                                                        \
+              class _DifferenceType,                                                                                   \
+              typename _Tp,                                                                                            \
+              typename _BinaryOperationType,                                                                           \
+              typename _UnaryOperation >                                                                               \
+    _LIBCPP_HIDE_FROM_ABI _Tp __omp_transform_reduce(                                                     \
+        _Iterator1 __first1,                                                                                           \
+        _Iterator2 __first2,                                                                                           \
+        _DifferenceType __n,                                                                                           \
+        _Tp __init,                                                                                                    \
+        std_op<_BinaryOperationType> __reduce,                                                                         \
+        _UnaryOperation __transform) noexcept {                                                                        \
+      __par_backend::__omp_map_to(__first1, __n);                                                                  \
+      __par_backend::__omp_map_to(__first2, __n);                                                                  \
+_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init))                                   \
+      for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                  \
+        __init = __reduce(__init, __transform(*(__first1 + __i), *(__first2 + __i)));                                  \
+      __par_backend::__omp_map_free(__first1, __n);                                                                \
+      __par_backend::__omp_map_free(__first2, __n);                                                                \
+      return __init;                                                                                                   \
+    }
+
+#  define __PSTL_OMP_SIMD_REDUCTION(omp_op, std_op)                                                                    \
+    __PSTL_OMP_SIMD_1_REDUCTION(omp_op, std_op)                                                                        \
+    __PSTL_OMP_SIMD_2_REDUCTION(omp_op, std_op)
+
+// Addition
+__PSTL_OMP_SIMD_REDUCTION(+, std::plus)
+
+// Subtraction
+__PSTL_OMP_SIMD_REDUCTION(-, std::minus)
+
+// Multiplication
+__PSTL_OMP_SIMD_REDUCTION(*, std::multiplies)
+
+// Logical and
+__PSTL_OMP_SIMD_REDUCTION(&&, std::logical_and)
+
+// Logical or
+__PSTL_OMP_SIMD_REDUCTION(||, std::logical_or)
+
+// Bitwise and
+__PSTL_OMP_SIMD_REDUCTION(&, std::bit_and)
+
+// Bitwise or
+__PSTL_OMP_SIMD_REDUCTION(|, std::bit_or)
+
+// Bitwise xor
+__PSTL_OMP_SIMD_REDUCTION(^, std::bit_xor)
+
+//===----------------------------------------------------------------------===//
+// The following struct is used to determine whether a reduction is supported by
+// the OpenMP backend.
+//===----------------------------------------------------------------------===//
+
 template <class _T1, class _T2, class _T3>
 struct __is_supported_reduction : std::false_type {};
 
@@ -48,9 +130,28 @@ __PSTL_IS_SUPPORTED_REDUCTION(bit_or)
 __PSTL_IS_SUPPORTED_REDUCTION(bit_xor)
 
 //===----------------------------------------------------------------------===//
-// Two input iterators
+// Implementation of PSTL transform_reduce for one and two input iterators
 //===----------------------------------------------------------------------===//
 
+template <class _ExecutionPolicy, class _ForwardIterator, class _Tp, class _BinaryOperation, class _UnaryOperation>
+_LIBCPP_HIDE_FROM_ABI optional<_Tp> __pstl_transform_reduce(
+    __omp_backend_tag,
+    _ForwardIterator __first,
+    _ForwardIterator __last,
+    _Tp __init,
+    _BinaryOperation __reduce,
+    _UnaryOperation __transform) {
+  if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
+                __is_parallel_execution_policy_v<_ExecutionPolicy> &&
+                __libcpp_is_contiguous_iterator<_ForwardIterator>::value && is_arithmetic_v<_Tp> &&
+                __is_supported_reduction<_BinaryOperation, _Tp, _Tp>::value) {
+    return std::__omp_transform_reduce(
+        std::__unwrap_iter(__first), __last - __first, __init, __reduce, __transform);
+  }
+  return std::__pstl_transform_reduce<_ExecutionPolicy>(
+      __cpu_backend_tag{}, __first, __last, std::move(__init), __reduce, __transform);
+}
+
 template <class _ExecutionPolicy,
           class _ForwardIterator1,
           class _ForwardIterator2,
@@ -71,36 +172,13 @@ _LIBCPP_HIDE_FROM_ABI optional<_Tp> __pstl_transform_reduce(
                 __libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator2>::value && is_arithmetic_v<_Tp> &&
                 __is_supported_reduction<_BinaryOperation1, _Tp, _Tp>::value) {
-    return std::__par_backend::__parallel_for_simd_reduction_2(
-        __first1, __first2, __last1 - __first1, __init, __reduce, __transform);
+    return std::__omp_transform_reduce(
+        std::__unwrap_iter(__first1), std::__unwrap_iter(__first2), __last1 - __first1, __init, __reduce, __transform);
   }
   return std::__pstl_transform_reduce<_ExecutionPolicy>(
       __cpu_backend_tag{}, __first1, __last1, __first2, std::move(__init), __reduce, __transform);
 }
 
-//===----------------------------------------------------------------------===//
-// One input iterator
-//===----------------------------------------------------------------------===//
-
-template <class _ExecutionPolicy, class _ForwardIterator, class _Tp, class _BinaryOperation, class _UnaryOperation>
-_LIBCPP_HIDE_FROM_ABI optional<_Tp> __pstl_transform_reduce(
-    __omp_backend_tag,
-    _ForwardIterator __first,
-    _ForwardIterator __last,
-    _Tp __init,
-    _BinaryOperation __reduce,
-    _UnaryOperation __transform) {
-  if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
-                __is_parallel_execution_policy_v<_ExecutionPolicy> &&
-                __libcpp_is_contiguous_iterator<_ForwardIterator>::value && is_arithmetic_v<_Tp> &&
-                __is_supported_reduction<_BinaryOperation, _Tp, _Tp>::value) {
-    return std::__par_backend::__parallel_for_simd_reduction_1(
-        __first, __last - __first, __init, __reduce, __transform);
-  }
-  return std::__pstl_transform_reduce<_ExecutionPolicy>(
-      __cpu_backend_tag{}, __first, __last, std::move(__init), __reduce, __transform);
-}
-
 _LIBCPP_END_NAMESPACE_STD
 
 #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17

>From cbec2f431d7562617b2905d09cb5847136c1d7d9 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Thu, 19 Oct 2023 17:55:42 -0700
Subject: [PATCH 18/46] Added OpenMP offloading documentation to libcxx

---
 libcxx/docs/BuildingLibcxx.rst |  11 ++++
 libcxx/docs/UsingLibcxx.rst    | 107 +++++++++++++++++++++++++++++++++
 2 files changed, 118 insertions(+)

diff --git a/libcxx/docs/BuildingLibcxx.rst b/libcxx/docs/BuildingLibcxx.rst
index 2cee97c03ced089..cbd6183cbd35660 100644
--- a/libcxx/docs/BuildingLibcxx.rst
+++ b/libcxx/docs/BuildingLibcxx.rst
@@ -408,6 +408,17 @@ libc++ Feature Options
   Use the specified GCC toolchain and standard library when building the native
   stdlib benchmark tests.
 
+.. option:: LIBCXX_PSTL_BACKEND:STRING
+
+  **Default**:: ``"serial"``
+
+  **Values**:: ``serial``, ``std-thread``, ``libdispatch``, ``openmp``
+
+  Select the desired backend for C++ parallel algorithms. All four options can
+  target multi-core CPU architectures, and ``openmp`` can additionally target
+  GPU architectures. The ``openmp`` backend requires OpenMP version 4.5 or
+  later.
+
 
 libc++ ABI Feature Options
 --------------------------
diff --git a/libcxx/docs/UsingLibcxx.rst b/libcxx/docs/UsingLibcxx.rst
index 52c76f3b10548fb..9ef4814d00254f4 100644
--- a/libcxx/docs/UsingLibcxx.rst
+++ b/libcxx/docs/UsingLibcxx.rst
@@ -466,6 +466,113 @@ Unpoisoning may not be an option, if (for example) you are not maintaining the a
 * You are using allocator, which does not call destructor during deallocation.
 * You are aware that memory allocated with an allocator may be accessed, even when unused by container.
 
+Offloading C++ Parallel Algorithms to GPUs
+------------------------------------------
+
+Experimental support for GPU offloading has been added to ``libc++``. The
+implementation uses OpenMP target offloading to leverage GPU compute resources.
+The OpenMP PSTL backend can target both NVIDIA and AMD GPUs.
+However, the implementation only supports contiguous iterators, such as 
+iterators for ``std::vector`` or ``std::array``.
+To enable the OpenMP offloading backend it must be selected with
+``LIBCXX_PSTL_BACKEND=openmp`` when installing ``libc++``. Further, when
+compiling a program, the user must specify the command line options
+``-fopenmp -fexperimental-library -stdlib=libc++``. To install LLVM with OpenMP
+offloading enabled, please read
+`the LLVM OpenMP FAQ. <https://openmp.llvm.org/SupportAndFAQ.html>`_ 
+You may also want to to visit
+`the OpenMP offloading command-line argument reference. <https://openmp.llvm.org/CommandLineArgumentReference.html#offload-command-line-arguments>`_ 
+
+Example
+~~~~~~~
+
+The following is an example of offloading vector addition to a GPU using our
+standard library extension.
+
+.. code-block:: cpp
+
+  #include <algorithm>
+  #include <execution>
+
+  template<typename T1, typename T2, typename T3>
+  void axpy(const T1 a,std::vector<T2>& x, std::vector<T3>& y)
+  {
+    std::transform(std::execution::par_unseq,x.begin(),x.end(), y.begin(), y.begin(),
+                  [=](T2 xi, T3 yi){ return a*xi + yi; });
+  }
+
+The execution policy ``std::execution::par_unseq`` states that the algorithm's
+execution may be parallelized, vectorized, and migrated across threads. This is
+the only execution mode that is safe to offload to GPUs, and for all other
+execution modes the algorithms will execute on the CPU.
+Special attention must be paid to the lambda captures when enabling GPU
+offloading. If the lambda captures by reference, the user must manually map the
+variables to the device. If capturing by reference, the above example could
+be implemented in the following way.
+
+.. code-block:: cpp
+
+  template<typename T1, typename T2, typename T3>
+  void axpy(const T1 a,std::vector<T2>& x, std::vector<T3>& y)
+  {
+  # pragma omp target data map(to:a)
+    std::transform(std::execution::par_unseq,x.begin(),x.end(), y.begin(), y.begin(),
+                  [&](T2 xi, T3 yi){ return a*xi + yi; });
+  }
+
+However, if unified shared memory, USM, is enabled, no additional data mapping
+is necessary when capturing y reference.
+
+Compiling functions for GPUs with OpenMP
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The C++ standard defines that all accesses to memory are inside a single address
+space. However, discrete GPU systems have distinct address spaces. A single
+address space can be emulated if your system supports unified shared memory.
+However, many discrete GPU systems do not, and in those cases it is important to
+pass device function pointers to the parallel algorithms. Below is an example of
+how the OpenMP `declare target` directive can be used to mark that a function
+should be compiled for both host and device. The device address of a function
+pointer can be obtained with `target map(from:<list of identifiers>)`.
+
+.. code-block:: cpp
+
+  // Declare that the function must be compiled for both host and device
+  #pragma omp declare target
+  void cube(int& n) {n*=n*n; };
+  #pragma omp end declare target
+
+  int main()
+  {
+    int * a =  new int[LEN];
+    // Initialize the array to 2 on the device
+    std::fill(std::execution::par_unseq,a, a+LEN,2);
+    // Get the device pointer for cube
+    void (*dcube)(int& n);
+    #pragma omp target map(from:dcube)
+    dcube = &cube;
+    // Pass the device function pointer to the parallel algorithm
+    std::for_each(std::execution::par_unseq,a, a+LEN,dcube);
+    // Validate that the result is 8 on the host for all array indices
+    std::for_each(std::execution::par,a, a+LEN,[&](int & n){
+      assert(n == 8);
+    });
+    delete[] a;
+    return 0;
+  }
+
+Without unified shared memory, the above example will not work if the host
+function pointer `cube` is passed to the parallel algorithm.
+
+Important notes about exception handling
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+GPU architectures do not support exception handling. If compiling a program
+containing parallel algorithms with ``clang`` 18 or newer, a program with
+exceptions in offloaded code regions will compile, but the program will
+terminate if an exception is thrown on the device. This does not conform with
+the C++ standard and exception handling on GPUs will hopefully be better
+supported in future releases of LLVM.
+
 Platform specific behavior
 ==========================
 

>From 00d5c0c4761242e48d45a2e89ea34c1013d744f9 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Fri, 20 Oct 2023 11:26:26 -0700
Subject: [PATCH 19/46] Changed argument names to reflect input and output

---
 .../pstl_backends/openmp/backend.h            | 15 +++---
 .../__algorithm/pstl_backends/openmp/fill.h   | 20 +++++---
 .../pstl_backends/openmp/find_if.h            |  5 +-
 .../pstl_backends/openmp/for_each.h           | 17 +++++--
 .../pstl_backends/openmp/omp_offload.h        | 21 +-------
 .../pstl_backends/openmp/transform.h          | 51 +++++++++++++------
 .../pstl_backends/openmp/transform_reduce.h   | 20 ++++----
 7 files changed, 85 insertions(+), 64 deletions(-)

diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/backend.h b/libcxx/include/__algorithm/pstl_backends/openmp/backend.h
index 9396f91b1a755f2..99245a7d13e9917 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/backend.h
@@ -20,13 +20,14 @@
 
 #if _LIBCPP_STD_VER >= 17
 
-#if defined(_LIBCPP_PSTL_BACKEND_OPENMP)
-# if !defined(_OPENMP)
-//#   warning "PSTL is configured to use the OpenMP backend, but OpenMP is not enabled. Did you compile with -fopenmp?"
-# elif (defined(_OPENMP) && _OPENMP < 201511)
-//#   warning "OpenMP target offloading has been supported since OpenMP version 4.5 (201511). Please use a more recent version of OpenMP."
-# endif
-#endif
+#  if defined(_LIBCPP_PSTL_BACKEND_OPENMP)
+#    if !defined(_OPENMP)
+// #   warning "PSTL is configured to use the OpenMP backend, but OpenMP is not enabled. Did you compile with -fopenmp?"
+#    elif (defined(_OPENMP) && _OPENMP < 201511)
+// #   warning "OpenMP target offloading has been supported since OpenMP version 4.5 (201511). Please use a more recent
+// version of OpenMP."
+#    endif
+#  endif
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/fill.h b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
index e8101233e342a3e..4262f778fa9121c 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
@@ -27,18 +27,26 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
+template <class _Tp, class _DifferenceType, class _Up>
+_LIBCPP_HIDE_FROM_ABI _Tp* __omp_fill(_Tp* __out1, _DifferenceType __n, const _Up& __value) noexcept {
+  __par_backend::__omp_map_alloc(__out1, __n);
+#  pragma omp target teams distribute parallel for simd
+  for (_DifferenceType __i = 0; __i < __n; ++__i)
+    *(__out1 + __i) = __value;
+  __par_backend::__omp_map_from(__out1, __n);
+  return __out1 + __n;
+}
+
 template <class _ExecutionPolicy, class _ForwardIterator, class _Tp>
 _LIBCPP_HIDE_FROM_ABI optional<__empty>
 __pstl_fill(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) {
-  // It is only safe to execute fill on the GPU, it the execution policy is
-  // parallel unsequenced, as it is the only execution policy allowing
-  // SIMD instructions
+  // If it is safe to offload the computations to the GPU, we call the OpenMP
+  // implementation of for_each. In the case of fill, we provide for_Each with a
+  // lambda returning a constant.
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __is_parallel_execution_policy_v<_ExecutionPolicy> &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
-    std::__rewrap_iter(__first, __par_backend::__omp_for_each(std::__unwrap_iter(__first), __last - __first, [&](std::remove_pointer_t<decltype(std::__unwrap_iter(__first))>& e) {
-          e = __value;
-        }));
+    std::__rewrap_iter(__first, std::__omp_fill(std::__unwrap_iter(__first), __last - __first, __value));
     return __empty{};
   }
   // Otherwise, we execute fill on the CPU instead
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
index f07965885c9c68c..8d7b196ec176f71 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
@@ -34,17 +34,20 @@ _LIBCPP_HIDE_FROM_ABI _Tp* __omp_find_if(_Tp* __first, _DifferenceType __n, _Pre
       idx = (__i < idx) ? __i : idx;
     }
   }
-  __par_backend::__omp_map_free(__first, __n);
+  __par_backend::__omp_map_release(__first, __n);
   return __first + idx;
 }
 
 template <class _ExecutionPolicy, class _ForwardIterator, class _Predicate>
 _LIBCPP_HIDE_FROM_ABI optional<_ForwardIterator>
 __pstl_find_if(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) {
+  // If it is safe to offload the computations to the GPU, we call the OpenMP
+  // implementation of find_if
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __is_parallel_execution_policy_v<_ExecutionPolicy> &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
     return std::__rewrap_iter(__first, std::__omp_find_if(std::__unwrap_iter(__first), __last - __first, __pred));
+    // Else we rey on the CPU PSTL backend
   } else {
     return std::__pstl_find_if<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __pred);
   }
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
index eb8caf23971cb33..f64efc170537af6 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
@@ -26,16 +26,25 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
+template <class _Tp, class _DifferenceType, class _Function>
+_LIBCPP_HIDE_FROM_ABI _Tp* __omp_for_each(_Tp* __inout1, _DifferenceType __n, _Function __f) noexcept {
+  __par_backend::__omp_map_to(__inout1, __n);
+#  pragma omp target teams distribute parallel for simd
+  for (_DifferenceType __i = 0; __i < __n; ++__i)
+    __f(*(__inout1 + __i));
+  __par_backend::__omp_map_from(__inout1, __n);
+  return __inout1 + __n;
+}
+
 template <class _ExecutionPolicy, class _ForwardIterator, class _Functor>
 _LIBCPP_HIDE_FROM_ABI optional<__empty>
 __pstl_for_each(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Functor __func) {
-  // It is only safe to execute for_each on the GPU, it the execution policy is
-  // parallel unsequenced, as it is the only execution policy prohibiting throwing
-  // exceptions and allowing SIMD instructions
+  // If it is safe to offload the computations to the GPU, we call the OpenMP
+  // implementation of for_each
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
-    __par_backend::__omp_for_each(std::__unwrap_iter(__first), __last - __first, __func);
+    std::__omp_for_each(std::__unwrap_iter(__first), __last - __first, __func);
     return __empty{};
   }
   // Else we fall back to the serial backend
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h b/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
index 2e44af3c9a761f5..88ee4cdbbc4faec 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
@@ -23,10 +23,6 @@
 #include <__utility/move.h>
 #include <cstddef>
 
-// is_same
-
-// __libcpp_is_contiguous_iterator
-
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
 #endif
@@ -70,26 +66,11 @@ __omp_map_alloc([[maybe_unused]] const _Iterator p, [[maybe_unused]] const _Diff
 
 template <class _Iterator, class _DifferenceType>
 _LIBCPP_HIDE_FROM_ABI void
-__omp_map_free([[maybe_unused]] const _Iterator p, [[maybe_unused]] const _DifferenceType len) noexcept {
+__omp_map_release([[maybe_unused]] const _Iterator p, [[maybe_unused]] const _DifferenceType len) noexcept {
   static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
 #  pragma omp target exit data map(release : p[0 : len])
 }
 
-//===----------------------------------------------------------------------===//
-// The OpenMP implementation of for_each is shared between for_each and fill
-//===----------------------------------------------------------------------===//
-
-template <class _Tp, class _DifferenceType, class _Function>
-_LIBCPP_HIDE_FROM_ABI _Tp*
-__omp_for_each(_Tp* __first, _DifferenceType __n, _Function __f) noexcept {
-  __par_backend::__omp_map_to(__first, __n);
-#  pragma omp target teams distribute parallel for simd
-  for (_DifferenceType __i = 0; __i < __n; ++__i)
-    __f(*(__first + __i));
-  __par_backend::__omp_map_from(__first, __n);
-  return __first + __n;
-}
-
 } // namespace __omp_gpu_backend
 } // namespace __par_backend
 
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
index f7e52d7933b6c40..bd795d5571f0d79 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
@@ -33,29 +33,45 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <class _Tp, class _DifferenceType, class _Up, class _Function>
 _LIBCPP_HIDE_FROM_ABI optional<__empty>
-__omp_transform(_Tp* __first1, _DifferenceType __n, _Up* __first2, _Function __f) noexcept {
-  __par_backend::__omp_map_alloc(__first2, __n);
-  __par_backend::__omp_map_to(__first1, __n);
+__omp_transform(_Tp* __in1, _DifferenceType __n, _Up* __out1, _Function __f) noexcept {
+  // The order of the following maps matter, as we wish to move the data. If
+  // they were placed in the reverse order, and __in equals __out, then we would
+  // allocate the buffer on the device without copying the data.
+  __par_backend::__omp_map_to(__in1, __n);
+  __par_backend::__omp_map_alloc(__out1, __n);
 #  pragma omp target teams distribute parallel for simd
   for (_DifferenceType __i = 0; __i < __n; ++__i)
-    *(__first2 + __i) = __f(*(__first1 + __i));
-  __par_backend::__omp_map_from(__first2, __n);
-  __par_backend::__omp_map_free(__first1, __n);
+    *(__out1 + __i) = __f(*(__in1 + __i));
+  // The order of the following two maps matters, since the user could legally
+  // overwrite __in The "release" map modifier decreases the reference counter
+  // by one, and "from" only moves the data to the host, when the reference
+  // count is decremented to zero.
+  __par_backend::__omp_map_release(__in1, __n);
+  __par_backend::__omp_map_from(__out1, __n);
   return __empty{};
 }
 
 template <class _Tp, class _DifferenceType, class _Up, class _Vp, class _Function>
 _LIBCPP_HIDE_FROM_ABI optional<__empty>
-__omp_transform(_Tp* __first1, _DifferenceType __n, _Up* __first2, _Vp* __first3, _Function __f) noexcept {
-  __par_backend::__omp_map_to(__first1, __n);
-  __par_backend::__omp_map_to(__first2, __n);
-  __par_backend::__omp_map_alloc(__first3, __n);
+__omp_transform(_Tp* __in1, _DifferenceType __n, _Up* __in2, _Vp* __out1, _Function __f) noexcept {
+  // The order of the following maps matter, as we wish to move the data. If
+  // they were placed in the reverse order, and __out equals __in1 or __in2,
+  // then we would allocate one of the buffer on the device without copying the
+  // data.
+  __par_backend::__omp_map_to(__in1, __n);
+  __par_backend::__omp_map_to(__in2, __n);
+  __par_backend::__omp_map_alloc(__out1, __n);
 #  pragma omp target teams distribute parallel for simd
   for (_DifferenceType __i = 0; __i < __n; ++__i)
-    *(__first3 + __i) = __f(*(__first1 + __i), *(__first2 + __i));
-  __par_backend::__omp_map_free(__first1, __n);
-  __par_backend::__omp_map_free(__first2, __n);
-  __par_backend::__omp_map_from(__first3, __n);
+    *(__out1 + __i) = __f(*(__in1 + __i), *(__in2 + __i));
+  // The order of the following three maps matters, since the user could legally
+  // overwrite either of the inputs if __out equals __in1 or __in2. The
+  // "release" map modifier decreases the reference counter by one, and "from"
+  // only moves the data from the device, when the reference count is
+  // decremented to zero.
+  __par_backend::__omp_map_release(__in1, __n);
+  __par_backend::__omp_map_release(__in2, __n);
+  __par_backend::__omp_map_from(__out1, __n);
   return __empty{};
 }
 
@@ -95,7 +111,12 @@ _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator> __pstl_transform(
                 __libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator2>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
-    std::__omp_transform(std::__unwrap_iter(__first1), __last1 - __first1, std::__unwrap_iter(__first2), std::__unwrap_iter(__result), __op);
+    std::__omp_transform(
+        std::__unwrap_iter(__first1),
+        __last1 - __first1,
+        std::__unwrap_iter(__first2),
+        std::__unwrap_iter(__result),
+        __op);
     return __result + (__last1 - __first1);
   }
   // If it is not safe to offload to the GPU, we rely on the CPU backend.
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
index 3798e1cb3f3fac1..1684c3d273b2239 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
@@ -39,17 +39,17 @@ _LIBCPP_BEGIN_NAMESPACE_STD
               typename _Tp,                                                                                            \
               typename _BinaryOperationType,                                                                           \
               typename _UnaryOperation>                                                                                \
-    _LIBCPP_HIDE_FROM_ABI _Tp __omp_transform_reduce(                                                     \
+    _LIBCPP_HIDE_FROM_ABI _Tp __omp_transform_reduce(                                                                  \
         _Iterator __first,                                                                                             \
         _DifferenceType __n,                                                                                           \
         _Tp __init,                                                                                                    \
         std_op<_BinaryOperationType> __reduce,                                                                         \
         _UnaryOperation __transform) noexcept {                                                                        \
-      __par_backend::__omp_map_to(__first, __n);                                                                   \
+      __par_backend::__omp_map_to(__first, __n);                                                                       \
 _PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init))                                   \
       for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                  \
         __init = __reduce(__init, __transform(*(__first + __i)));                                                      \
-      __par_backend::__omp_map_free(__first, __n);                                                                 \
+      __par_backend::__omp_map_release(__first, __n);                                                                  \
       return __init;                                                                                                   \
     }
 
@@ -60,20 +60,20 @@ _PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__in
               typename _Tp,                                                                                            \
               typename _BinaryOperationType,                                                                           \
               typename _UnaryOperation >                                                                               \
-    _LIBCPP_HIDE_FROM_ABI _Tp __omp_transform_reduce(                                                     \
+    _LIBCPP_HIDE_FROM_ABI _Tp __omp_transform_reduce(                                                                  \
         _Iterator1 __first1,                                                                                           \
         _Iterator2 __first2,                                                                                           \
         _DifferenceType __n,                                                                                           \
         _Tp __init,                                                                                                    \
         std_op<_BinaryOperationType> __reduce,                                                                         \
         _UnaryOperation __transform) noexcept {                                                                        \
-      __par_backend::__omp_map_to(__first1, __n);                                                                  \
-      __par_backend::__omp_map_to(__first2, __n);                                                                  \
+      __par_backend::__omp_map_to(__first1, __n);                                                                      \
+      __par_backend::__omp_map_to(__first2, __n);                                                                      \
 _PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init))                                   \
       for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                  \
         __init = __reduce(__init, __transform(*(__first1 + __i), *(__first2 + __i)));                                  \
-      __par_backend::__omp_map_free(__first1, __n);                                                                \
-      __par_backend::__omp_map_free(__first2, __n);                                                                \
+      __par_backend::__omp_map_release(__first1, __n);                                                                 \
+      __par_backend::__omp_map_release(__first2, __n);                                                                 \
       return __init;                                                                                                   \
     }
 
@@ -145,8 +145,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_Tp> __pstl_transform_reduce(
                 __is_parallel_execution_policy_v<_ExecutionPolicy> &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value && is_arithmetic_v<_Tp> &&
                 __is_supported_reduction<_BinaryOperation, _Tp, _Tp>::value) {
-    return std::__omp_transform_reduce(
-        std::__unwrap_iter(__first), __last - __first, __init, __reduce, __transform);
+    return std::__omp_transform_reduce(std::__unwrap_iter(__first), __last - __first, __init, __reduce, __transform);
   }
   return std::__pstl_transform_reduce<_ExecutionPolicy>(
       __cpu_backend_tag{}, __first, __last, std::move(__init), __reduce, __transform);
@@ -166,7 +165,6 @@ _LIBCPP_HIDE_FROM_ABI optional<_Tp> __pstl_transform_reduce(
     _Tp __init,
     _BinaryOperation1 __reduce,
     _BinaryOperation2 __transform) {
-  // The interface for the function switched between C++17 and C++20
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __is_parallel_execution_policy_v<_ExecutionPolicy> &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&

>From 79bd44748f194c22c4b4674e69ea7a1d1673e2fc Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Fri, 20 Oct 2023 11:42:13 -0700
Subject: [PATCH 20/46] Adding compile time errors to
 pstl_backend/openmp/backend.h to validate that OpenMP is enabled

---
 libcxx/docs/UsingLibcxx.rst                               | 1 +
 libcxx/include/__algorithm/pstl_backends/openmp/backend.h | 5 ++---
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/libcxx/docs/UsingLibcxx.rst b/libcxx/docs/UsingLibcxx.rst
index 9ef4814d00254f4..c9726ef4f604018 100644
--- a/libcxx/docs/UsingLibcxx.rst
+++ b/libcxx/docs/UsingLibcxx.rst
@@ -525,6 +525,7 @@ is necessary when capturing y reference.
 
 Compiling functions for GPUs with OpenMP
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
 The C++ standard defines that all accesses to memory are inside a single address
 space. However, discrete GPU systems have distinct address spaces. A single
 address space can be emulated if your system supports unified shared memory.
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/backend.h b/libcxx/include/__algorithm/pstl_backends/openmp/backend.h
index 99245a7d13e9917..11ab0bb108b2251 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/backend.h
@@ -22,10 +22,9 @@
 
 #  if defined(_LIBCPP_PSTL_BACKEND_OPENMP)
 #    if !defined(_OPENMP)
-// #   warning "PSTL is configured to use the OpenMP backend, but OpenMP is not enabled. Did you compile with -fopenmp?"
+#   error "PSTL is configured to use the OpenMP backend, but OpenMP is not enabled. Did you compile with -fopenmp?"
 #    elif (defined(_OPENMP) && _OPENMP < 201511)
-// #   warning "OpenMP target offloading has been supported since OpenMP version 4.5 (201511). Please use a more recent
-// version of OpenMP."
+#   error "OpenMP target offloading has been supported since OpenMP version 4.5 (201511). Please use a more recent version of OpenMP."
 #    endif
 #  endif
 

>From 9ec0744470cf3c0d380af8aaabe84129780b8868 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Fri, 20 Oct 2023 11:58:27 -0700
Subject: [PATCH 21/46] Clang-formatted error message in backend.h

---
 libcxx/include/__algorithm/pstl_backends/openmp/backend.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/backend.h b/libcxx/include/__algorithm/pstl_backends/openmp/backend.h
index 11ab0bb108b2251..eb5e40d6f20a94d 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/backend.h
@@ -22,9 +22,10 @@
 
 #  if defined(_LIBCPP_PSTL_BACKEND_OPENMP)
 #    if !defined(_OPENMP)
-#   error "PSTL is configured to use the OpenMP backend, but OpenMP is not enabled. Did you compile with -fopenmp?"
+#      error "PSTL is configured to use the OpenMP backend, but OpenMP is not enabled. Did you compile with -fopenmp?"
 #    elif (defined(_OPENMP) && _OPENMP < 201511)
-#   error "OpenMP target offloading has been supported since OpenMP version 4.5 (201511). Please use a more recent version of OpenMP."
+#      error                                                                                                           \
+          "OpenMP target offloading has been supported since OpenMP version 4.5 (201511). Please use a more recent version of OpenMP."
 #    endif
 #  endif
 

>From bcd473205a3528c98cf4b679d69c21aec5cd8378 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Fri, 20 Oct 2023 12:48:16 -0700
Subject: [PATCH 22/46] Disabling ADL in pstl_backends/openmp/stable_sort.h

---
 libcxx/include/__algorithm/pstl_backends/openmp/stable_sort.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/stable_sort.h b/libcxx/include/__algorithm/pstl_backends/openmp/stable_sort.h
index a4c6a2bff9f92fd..0b5ce39a2344a9a 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/stable_sort.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/stable_sort.h
@@ -28,7 +28,7 @@ template <class _ExecutionPolicy, class _RandomAccessIterator, class _Comp>
 _LIBCPP_HIDE_FROM_ABI optional<__empty>
 __pstl_stable_sort(__omp_backend_tag, _RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp) {
   // TODO: Implement GPU backend.
-  return __pstl_stable_sort<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __comp);
+  return std::__pstl_stable_sort<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __comp);
 }
 
 _LIBCPP_END_NAMESPACE_STD

>From 827f266f6a42b6e7385ba42032beca5c5f634287 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Fri, 20 Oct 2023 12:54:28 -0700
Subject: [PATCH 23/46] Added std::__rewrap_iter to std::transform

---
 .../__algorithm/pstl_backends/openmp/transform.h | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
index bd795d5571f0d79..0338771728d9d0a 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
@@ -32,7 +32,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 //===----------------------------------------------------------------------===//
 
 template <class _Tp, class _DifferenceType, class _Up, class _Function>
-_LIBCPP_HIDE_FROM_ABI optional<__empty>
+_LIBCPP_HIDE_FROM_ABI _Tp*
 __omp_transform(_Tp* __in1, _DifferenceType __n, _Up* __out1, _Function __f) noexcept {
   // The order of the following maps matter, as we wish to move the data. If
   // they were placed in the reverse order, and __in equals __out, then we would
@@ -48,11 +48,11 @@ __omp_transform(_Tp* __in1, _DifferenceType __n, _Up* __out1, _Function __f) noe
   // count is decremented to zero.
   __par_backend::__omp_map_release(__in1, __n);
   __par_backend::__omp_map_from(__out1, __n);
-  return __empty{};
+  return __out1 + __n;
 }
 
 template <class _Tp, class _DifferenceType, class _Up, class _Vp, class _Function>
-_LIBCPP_HIDE_FROM_ABI optional<__empty>
+_LIBCPP_HIDE_FROM_ABI _Tp*
 __omp_transform(_Tp* __in1, _DifferenceType __n, _Up* __in2, _Vp* __out1, _Function __f) noexcept {
   // The order of the following maps matter, as we wish to move the data. If
   // they were placed in the reverse order, and __out equals __in1 or __in2,
@@ -72,7 +72,7 @@ __omp_transform(_Tp* __in1, _DifferenceType __n, _Up* __in2, _Vp* __out1, _Funct
   __par_backend::__omp_map_release(__in1, __n);
   __par_backend::__omp_map_release(__in2, __n);
   __par_backend::__omp_map_from(__out1, __n);
-  return __empty{};
+  return __out1 + __n;
 }
 
 template <class _ExecutionPolicy, class _ForwardIterator, class _ForwardOutIterator, class _UnaryOperation>
@@ -86,8 +86,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator> __pstl_transform(
                 __is_parallel_execution_policy_v<_ExecutionPolicy> &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
-    std::__omp_transform(std::__unwrap_iter(__first), __last - __first, std::__unwrap_iter(__result), __op);
-    return __result + (__last - __first);
+    std::__rewrap_iter(__result,std::__omp_transform(std::__unwrap_iter(__first), __last - __first, std::__unwrap_iter(__result), __op));
   }
   // If it is not safe to offload to the GPU, we rely on the CPU backend.
   return std::__pstl_transform<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __result, __op);
@@ -111,13 +110,12 @@ _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator> __pstl_transform(
                 __libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator2>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
-    std::__omp_transform(
+    return std::__rewrap_iter(__result,std::__omp_transform(
         std::__unwrap_iter(__first1),
         __last1 - __first1,
         std::__unwrap_iter(__first2),
         std::__unwrap_iter(__result),
-        __op);
-    return __result + (__last1 - __first1);
+        __op));
   }
   // If it is not safe to offload to the GPU, we rely on the CPU backend.
   return std::__pstl_transform<_ExecutionPolicy>(__cpu_backend_tag{}, __first1, __last1, __first2, __result, __op);

>From 0432677728de17fe317ffcad0519d56b18016b79 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Fri, 20 Oct 2023 13:17:56 -0700
Subject: [PATCH 24/46] Clang-formatted
 libcxx/include/__algorithm/pstl_backends/openmp/transform.h

---
 .../pstl_backends/openmp/transform.h          | 21 +++++++++++--------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
index 0338771728d9d0a..e1c5f54675b3870 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
@@ -32,8 +32,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 //===----------------------------------------------------------------------===//
 
 template <class _Tp, class _DifferenceType, class _Up, class _Function>
-_LIBCPP_HIDE_FROM_ABI _Tp*
-__omp_transform(_Tp* __in1, _DifferenceType __n, _Up* __out1, _Function __f) noexcept {
+_LIBCPP_HIDE_FROM_ABI _Tp* __omp_transform(_Tp* __in1, _DifferenceType __n, _Up* __out1, _Function __f) noexcept {
   // The order of the following maps matter, as we wish to move the data. If
   // they were placed in the reverse order, and __in equals __out, then we would
   // allocate the buffer on the device without copying the data.
@@ -86,7 +85,9 @@ _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator> __pstl_transform(
                 __is_parallel_execution_policy_v<_ExecutionPolicy> &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
-    std::__rewrap_iter(__result,std::__omp_transform(std::__unwrap_iter(__first), __last - __first, std::__unwrap_iter(__result), __op));
+    std::__rewrap_iter(
+        __result,
+        std::__omp_transform(std::__unwrap_iter(__first), __last - __first, std::__unwrap_iter(__result), __op));
   }
   // If it is not safe to offload to the GPU, we rely on the CPU backend.
   return std::__pstl_transform<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __result, __op);
@@ -110,12 +111,14 @@ _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator> __pstl_transform(
                 __libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator2>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
-    return std::__rewrap_iter(__result,std::__omp_transform(
-        std::__unwrap_iter(__first1),
-        __last1 - __first1,
-        std::__unwrap_iter(__first2),
-        std::__unwrap_iter(__result),
-        __op));
+    return std::__rewrap_iter(
+        __result,
+        std::__omp_transform(
+            std::__unwrap_iter(__first1),
+            __last1 - __first1,
+            std::__unwrap_iter(__first2),
+            std::__unwrap_iter(__result),
+            __op));
   }
   // If it is not safe to offload to the GPU, we rely on the CPU backend.
   return std::__pstl_transform<_ExecutionPolicy>(__cpu_backend_tag{}, __first1, __last1, __first2, __result, __op);

>From 8a2af8a533cb4e09997e2a3f35fc26849b4641df Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Fri, 20 Oct 2023 14:40:03 -0700
Subject: [PATCH 25/46] Fixing errors detected by C++26 buildbot

---
 .../__algorithm/pstl_backends/openmp/find_if.h   |  8 ++++----
 .../pstl_backends/openmp/omp_offload.h           | 16 ++++++++--------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
index 8d7b196ec176f71..b86c6cbebf2201b 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
@@ -27,15 +27,15 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 template <class _Tp, class _DifferenceType, class _Predicate>
 _LIBCPP_HIDE_FROM_ABI _Tp* __omp_find_if(_Tp* __first, _DifferenceType __n, _Predicate __pred) noexcept {
   __par_backend::__omp_map_to(__first, __n);
-  _DifferenceType idx = __n;
-#  pragma omp target teams distribute parallel for simd reduction(min : idx)
+  _DifferenceType __idx = __n;
+#  pragma omp target teams distribute parallel for simd reduction(min : __idx)
   for (_DifferenceType __i = 0; __i < __n; ++__i) {
     if (__pred(*(__first + __i))) {
-      idx = (__i < idx) ? __i : idx;
+      __idx = (__i < __idx) ? __i : __idx;
     }
   }
   __par_backend::__omp_map_release(__first, __n);
-  return __first + idx;
+  return __first + __idx;
 }
 
 template <class _ExecutionPolicy, class _ForwardIterator, class _Predicate>
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h b/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
index 88ee4cdbbc4faec..99ac28a7a33ead3 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
@@ -45,30 +45,30 @@ inline namespace __omp_gpu_backend {
 
 template <class _Iterator, class _DifferenceType>
 _LIBCPP_HIDE_FROM_ABI void
-__omp_map_to([[maybe_unused]] const _Iterator p, [[maybe_unused]] const _DifferenceType len) noexcept {
+__omp_map_to([[maybe_unused]] const _Iterator __p, [[maybe_unused]] const _DifferenceType __len) noexcept {
   static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
-#  pragma omp target enter data map(to : p[0 : len])
+#  pragma omp target enter data map(to : __p [0:__len])
 }
 
 template <class _Iterator, class _DifferenceType>
 _LIBCPP_HIDE_FROM_ABI void
-__omp_map_from([[maybe_unused]] const _Iterator p, [[maybe_unused]] const _DifferenceType len) noexcept {
+__omp_map_from([[maybe_unused]] const _Iterator __p, [[maybe_unused]] const _DifferenceType __len) noexcept {
   static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
-#  pragma omp target exit data map(from : p[0 : len])
+#  pragma omp target exit data map(from : __p [0:__len])
 }
 
 template <class _Iterator, class _DifferenceType>
 _LIBCPP_HIDE_FROM_ABI void
-__omp_map_alloc([[maybe_unused]] const _Iterator p, [[maybe_unused]] const _DifferenceType len) noexcept {
+__omp_map_alloc([[maybe_unused]] const _Iterator __p, [[maybe_unused]] const _DifferenceType __len) noexcept {
   static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
-#  pragma omp target enter data map(alloc : p[0 : len])
+#  pragma omp target enter data map(alloc : __p [0:__len])
 }
 
 template <class _Iterator, class _DifferenceType>
 _LIBCPP_HIDE_FROM_ABI void
-__omp_map_release([[maybe_unused]] const _Iterator p, [[maybe_unused]] const _DifferenceType len) noexcept {
+__omp_map_release([[maybe_unused]] const _Iterator __p, [[maybe_unused]] const _DifferenceType __len) noexcept {
   static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
-#  pragma omp target exit data map(release : p[0 : len])
+#  pragma omp target exit data map(release : __p [0:__len])
 }
 
 } // namespace __omp_gpu_backend

>From 1c4773e91d75dfc6f5ca008639ea6d0f68117d33 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Mon, 23 Oct 2023 11:05:33 -0700
Subject: [PATCH 26/46] Updating includes once more after refactorings

---
 .../__algorithm/pstl_backends/openmp.h        | 35 +++++++++++++++++++
 .../__algorithm/pstl_backends/openmp/fill.h   |  1 +
 .../pstl_backends/openmp/find_if.h            |  1 +
 .../pstl_backends/openmp/for_each.h           |  1 +
 .../pstl_backends/openmp/omp_offload.h        | 27 ++++----------
 .../pstl_backends/openmp/transform.h          |  1 +
 .../pstl_backends/openmp/transform_reduce.h   |  2 ++
 7 files changed, 47 insertions(+), 21 deletions(-)

diff --git a/libcxx/include/__algorithm/pstl_backends/openmp.h b/libcxx/include/__algorithm/pstl_backends/openmp.h
index 9e5490db655f4bd..a1d015e056837b9 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp.h
@@ -11,6 +11,41 @@
 
 #include <__config>
 
+/*
+Combined OpenMP CPU and GPU Backend
+===================================
+Contrary to the CPU backends found in ./cpu_backends/, the OpenMP backend can
+target both CPUs and GPUs. The OpenMP standard defines that when offloading code
+to an accelerator, the compiler must generate a fallback code for execution on
+the host. Thereby, the backend works as a CPU backend if no targeted accelerator
+is available at execution time. The target regions can also be compiled directly
+for a CPU architecture, for instance by adding the command-line option
+`-fopenmp-targets=x86_64-pc-linux-gnu` in Clang.
+
+Implicit Assumptions
+--------------------
+If the user provides a function pointer as an argument to a parallel algorithm,
+it is assumed that it is the device pointer as there is currently no way to
+check whether a host or device pointer was passed.
+
+Mapping Clauses
+---------------
+In some of the parallel algorithms, the user is allowed to provide the same
+iterator as input and output. Hence, the order of the maps matters. Therefore,
+`pragma omp target data map(to:...)` must be used before
+`pragma omp target data map(alloc:...)`. Conversely, the maps with map modifier
+`release` must be placed before the maps with map modifier `from` when
+transferring the result from the device to the host.
+
+Exceptions
+----------
+Currently, GPU architectures do not handle exceptions. OpenMP target regions are
+allowed to contain try/catch statements and throw expressions in Clang, but if a
+throw expression is reached, it will terminate the program. That does not
+conform with the C++ standard.
+
+*/
+
 #include <__algorithm/pstl_backends/openmp/backend.h>
 
 #include <__algorithm/pstl_backends/openmp/any_of.h>
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/fill.h b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
index 4262f778fa9121c..53de1284acc60b8 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
@@ -12,6 +12,7 @@
 #include <__algorithm/fill.h>
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
+#include <__algorithm/unwrap_iter.h>
 #include <__config>
 #include <__iterator/wrap_iter.h>
 #include <__type_traits/is_execution_policy.h>
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
index b86c6cbebf2201b..db128e058823db7 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
@@ -12,6 +12,7 @@
 #include <__algorithm/find_if.h>
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
+#include <__algorithm/unwrap_iter.h>
 #include <__config>
 #include <__iterator/wrap_iter.h>
 #include <optional>
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
index f64efc170537af6..ef73b864773fc10 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
@@ -12,6 +12,7 @@
 #include <__algorithm/for_each.h>
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
+#include <__algorithm/unwrap_iter.h>
 #include <__config>
 #include <__iterator/wrap_iter.h>
 #include <__type_traits/is_execution_policy.h>
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h b/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
index 99ac28a7a33ead3..d47fdcd39c39b25 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
@@ -9,33 +9,20 @@
 #ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_OMP_OFFLOAD_H
 #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_OMP_OFFLOAD_H
 
-#include <__algorithm/unwrap_iter.h>
 #include <__assert>
 #include <__config>
-#include <__functional/operations.h>
 #include <__iterator/iterator_traits.h>
-#include <__iterator/wrap_iter.h>
-#include <__memory/addressof.h>
-#include <__memory/pointer_traits.h>
-#include <__type_traits/is_pointer.h>
-#include <__type_traits/is_same.h>
-#include <__utility/empty.h>
-#include <__utility/move.h>
-#include <cstddef>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
 #endif
 
-_LIBCPP_PUSH_MACROS
-#include <__undef_macros>
-
 #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 namespace __par_backend {
-inline namespace __omp_gpu_backend {
+inline namespace __omp_backend {
 
 //===----------------------------------------------------------------------===//
 // The following four functions can be used to map contiguous array sections to
@@ -47,37 +34,35 @@ template <class _Iterator, class _DifferenceType>
 _LIBCPP_HIDE_FROM_ABI void
 __omp_map_to([[maybe_unused]] const _Iterator __p, [[maybe_unused]] const _DifferenceType __len) noexcept {
   static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
-#  pragma omp target enter data map(to : __p [0:__len])
+#  pragma omp target enter data map(to : __p[0 : __len])
 }
 
 template <class _Iterator, class _DifferenceType>
 _LIBCPP_HIDE_FROM_ABI void
 __omp_map_from([[maybe_unused]] const _Iterator __p, [[maybe_unused]] const _DifferenceType __len) noexcept {
   static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
-#  pragma omp target exit data map(from : __p [0:__len])
+#  pragma omp target exit data map(from : __p[0 : __len])
 }
 
 template <class _Iterator, class _DifferenceType>
 _LIBCPP_HIDE_FROM_ABI void
 __omp_map_alloc([[maybe_unused]] const _Iterator __p, [[maybe_unused]] const _DifferenceType __len) noexcept {
   static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
-#  pragma omp target enter data map(alloc : __p [0:__len])
+#  pragma omp target enter data map(alloc : __p[0 : __len])
 }
 
 template <class _Iterator, class _DifferenceType>
 _LIBCPP_HIDE_FROM_ABI void
 __omp_map_release([[maybe_unused]] const _Iterator __p, [[maybe_unused]] const _DifferenceType __len) noexcept {
   static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
-#  pragma omp target exit data map(release : __p [0:__len])
+#  pragma omp target exit data map(release : __p[0 : __len])
 }
 
-} // namespace __omp_gpu_backend
+} // namespace __omp_backend
 } // namespace __par_backend
 
 _LIBCPP_END_NAMESPACE_STD
 
 #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && && _LIBCPP_STD_VER >= 17
 
-_LIBCPP_POP_MACROS
-
 #endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_OMP_OFFLOAD_H
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
index e1c5f54675b3870..eec6fb6fed82070 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
@@ -12,6 +12,7 @@
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__algorithm/transform.h>
+#include <__algorithm/unwrap_iter.h>
 #include <__config>
 #include <__iterator/concepts.h>
 #include <__iterator/wrap_iter.h>
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
index 1684c3d273b2239..a03dafabc8d3870 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
@@ -11,7 +11,9 @@
 
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
+#include <__algorithm/unwrap_iter.h>
 #include <__config>
+#include <__functional/operations.h>
 #include <__iterator/concepts.h>
 #include <__iterator/wrap_iter.h>
 #include <__numeric/transform_reduce.h>

>From 4abc8b3217fe3ac14d29cdb4891cbd2d86276ae8 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Mon, 23 Oct 2023 12:29:08 -0700
Subject: [PATCH 27/46] Adding more comments and removing more includes

---
 .../__algorithm/pstl_backends/openmp.h        | 21 +++++++++++++++++++
 .../__algorithm/pstl_backends/openmp/any_of.h |  1 -
 .../pstl_backends/openmp/backend.h            |  1 -
 .../__algorithm/pstl_backends/openmp/fill.h   |  1 -
 .../pstl_backends/openmp/for_each.h           |  1 -
 .../pstl_backends/openmp/transform.h          |  1 -
 .../pstl_backends/openmp/transform_reduce.h   |  2 --
 7 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/libcxx/include/__algorithm/pstl_backends/openmp.h b/libcxx/include/__algorithm/pstl_backends/openmp.h
index a1d015e056837b9..a3b52456dc6a609 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp.h
@@ -22,6 +22,24 @@ is available at execution time. The target regions can also be compiled directly
 for a CPU architecture, for instance by adding the command-line option
 `-fopenmp-targets=x86_64-pc-linux-gnu` in Clang.
 
+When is an Algorithm Offloaded?
+-------------------------------
+Only parallel algorithms with the parallel unsequenced execution policy are
+offloaded to the device. We cannot offload parallel algorithms with a parallel
+execution policy to GPUs because invocations executing in the same thread "are
+indeterminately sequenced with respect to each other" which we cannot guarantee
+on a GPU.
+
+The standard draft states that "the semantics [...] allow the implementation to
+fall back to sequential execution if the system cannot parallelize an algorithm
+invocation". If it is not deemed safe to offload the parallel algorithm to the
+device, we first fall back to a parallel unsequenced implementation from
+./cpu_backends. The CPU implementation may then fall back to sequential
+execution. In that way we strive to achieve the best possible performance.
+
+Further, "it is the caller's responsibility to ensure that the invocation does
+not introduce data races or deadlocks."
+
 Implicit Assumptions
 --------------------
 If the user provides a function pointer as an argument to a parallel algorithm,
@@ -44,6 +62,9 @@ allowed to contain try/catch statements and throw expressions in Clang, but if a
 throw expression is reached, it will terminate the program. That does not
 conform with the C++ standard.
 
+[This document](https://eel.is/c++draft/algorithms.parallel) has been used as
+reference for these considerations.
+
 */
 
 #include <__algorithm/pstl_backends/openmp/backend.h>
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/any_of.h b/libcxx/include/__algorithm/pstl_backends/openmp/any_of.h
index 65f2294ff2ee5fe..ec5b4c4a4c3aa25 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/any_of.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/any_of.h
@@ -13,7 +13,6 @@
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__config>
-#include <__type_traits/is_execution_policy.h>
 #include <optional>
 
 #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/backend.h b/libcxx/include/__algorithm/pstl_backends/openmp/backend.h
index eb5e40d6f20a94d..401bb2a770379e9 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/backend.h
@@ -10,7 +10,6 @@
 #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_OPENMP_BACKEND_BACKEND_H
 
 #include <__config>
-#include <cstddef>
 
 #include <__algorithm/pstl_backends/openmp/omp_offload.h>
 
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/fill.h b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
index 53de1284acc60b8..97e80c5ca853b27 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
@@ -17,7 +17,6 @@
 #include <__iterator/wrap_iter.h>
 #include <__type_traits/is_execution_policy.h>
 #include <__type_traits/remove_pointer.h>
-#include <__utility/empty.h>
 #include <optional>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
index ef73b864773fc10..e910d8dcc1d394b 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
@@ -16,7 +16,6 @@
 #include <__config>
 #include <__iterator/wrap_iter.h>
 #include <__type_traits/is_execution_policy.h>
-#include <__utility/empty.h>
 #include <optional>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
index eec6fb6fed82070..f8aca11fc2102fb 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
@@ -14,7 +14,6 @@
 #include <__algorithm/transform.h>
 #include <__algorithm/unwrap_iter.h>
 #include <__config>
-#include <__iterator/concepts.h>
 #include <__iterator/wrap_iter.h>
 #include <__type_traits/is_execution_policy.h>
 #include <optional>
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
index a03dafabc8d3870..9e83a91d10caded 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
@@ -14,10 +14,8 @@
 #include <__algorithm/unwrap_iter.h>
 #include <__config>
 #include <__functional/operations.h>
-#include <__iterator/concepts.h>
 #include <__iterator/wrap_iter.h>
 #include <__numeric/transform_reduce.h>
-#include <__type_traits/integral_constant.h>
 #include <__type_traits/is_arithmetic.h>
 #include <__type_traits/is_execution_policy.h>
 #include <__type_traits/operation_traits.h>

>From 1cd48e409a1fd47380adcb398710a1e4e109cfeb Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Mon, 23 Oct 2023 12:49:36 -0700
Subject: [PATCH 28/46] Fix Apple build bot

---
 libcxx/cmake/caches/Apple.cmake | 2 +-
 libcxx/src/CMakeLists.txt       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/libcxx/cmake/caches/Apple.cmake b/libcxx/cmake/caches/Apple.cmake
index 804eccd3a5dc5e9..e1aea0313cb9f16 100644
--- a/libcxx/cmake/caches/Apple.cmake
+++ b/libcxx/cmake/caches/Apple.cmake
@@ -7,7 +7,7 @@ set(LIBCXX_ENABLE_STATIC ON CACHE BOOL "")
 set(LIBCXX_ENABLE_SHARED ON CACHE BOOL "")
 set(LIBCXX_CXX_ABI libcxxabi CACHE STRING "")
 set(LIBCXX_ENABLE_VENDOR_AVAILABILITY_ANNOTATIONS ON CACHE BOOL "")
-set(LIBCXX_PSTL_CPU_BACKEND libdispatch CACHE STRING "")
+set(LIBCXX_PSTL_BACKEND libdispatch CACHE STRING "")
 
 set(LIBCXX_HERMETIC_STATIC_LIBRARY ON CACHE BOOL "")
 set(LIBCXXABI_HERMETIC_STATIC_LIBRARY ON CACHE BOOL "")
diff --git a/libcxx/src/CMakeLists.txt b/libcxx/src/CMakeLists.txt
index e57fbf1468acb2b..db3bcbaae425a56 100644
--- a/libcxx/src/CMakeLists.txt
+++ b/libcxx/src/CMakeLists.txt
@@ -320,7 +320,7 @@ set(LIBCXX_EXPERIMENTAL_SOURCES
   experimental/memory_resource.cpp
   )
 
-if (LIBCXX_PSTL_CPU_BACKEND STREQUAL "libdispatch")
+if (LIBCXX_PSTL_BACKEND STREQUAL "libdispatch")
   list(APPEND LIBCXX_EXPERIMENTAL_SOURCES
     pstl/libdispatch.cpp
     )

>From 2babeaba8f08a812f4ca78b9779979e718969fdf Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Mon, 23 Oct 2023 14:30:10 -0700
Subject: [PATCH 29/46] Including empty.h in for_each and fill

---
 libcxx/include/__algorithm/pstl_backends/openmp/fill.h     | 1 +
 libcxx/include/__algorithm/pstl_backends/openmp/for_each.h | 1 +
 2 files changed, 2 insertions(+)

diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/fill.h b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
index 97e80c5ca853b27..53de1284acc60b8 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
@@ -17,6 +17,7 @@
 #include <__iterator/wrap_iter.h>
 #include <__type_traits/is_execution_policy.h>
 #include <__type_traits/remove_pointer.h>
+#include <__utility/empty.h>
 #include <optional>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
index e910d8dcc1d394b..ef73b864773fc10 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
@@ -16,6 +16,7 @@
 #include <__config>
 #include <__iterator/wrap_iter.h>
 #include <__type_traits/is_execution_policy.h>
+#include <__utility/empty.h>
 #include <optional>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)

>From b1ad9cc6d0f4a1dc36eaec5b75b587be14fa2c0d Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Wed, 25 Oct 2023 22:51:03 -0700
Subject: [PATCH 30/46] Adding support for OpenMP compilation of LIT tests

---
 libcxx/CMakeLists.txt                                 |  2 ++
 .../include/__algorithm/pstl_backends/openmp/fill.h   |  5 ++++-
 .../__algorithm/pstl_backends/openmp/find_if.h        |  5 ++++-
 .../__algorithm/pstl_backends/openmp/for_each.h       |  5 ++++-
 .../__algorithm/pstl_backends/openmp/transform.h      | 11 +++++++++--
 .../pstl_backends/openmp/transform_reduce.h           |  9 +++++++--
 libcxx/test/CMakeLists.txt                            |  8 ++++++++
 libcxx/utils/libcxx/test/params.py                    |  8 ++++++++
 libcxxabi/CMakeLists.txt                              |  8 ++++++++
 9 files changed, 54 insertions(+), 7 deletions(-)

diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt
index 0fdeab59e89f767..71ba749fd120dd5 100644
--- a/libcxx/CMakeLists.txt
+++ b/libcxx/CMakeLists.txt
@@ -782,6 +782,8 @@ elseif (LIBCXX_PSTL_BACKEND STREQUAL "libdispatch")
   config_define(1 _LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH)
 elseif (LIBCXX_PSTL_BACKEND STREQUAL "openmp")
   config_define(1 _LIBCPP_PSTL_BACKEND_OPENMP)
+  # Making sure that OpenMP is enabled during build
+  add_compile_options(-fopenmp)
 else()
   message(FATAL_ERROR "LIBCXX_PSTL_BACKEND is set to ${LIBCXX_PSTL_BACKEND}, which is not a valid backend.
                        Valid backends are: serial, std-thread, libdispatch, and openmp.")
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/fill.h b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
index 53de1284acc60b8..7a9c379491a5f21 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
@@ -16,6 +16,7 @@
 #include <__config>
 #include <__iterator/wrap_iter.h>
 #include <__type_traits/is_execution_policy.h>
+#include <__type_traits/is_trivially_copyable.h>
 #include <__type_traits/remove_pointer.h>
 #include <__utility/empty.h>
 #include <optional>
@@ -46,7 +47,9 @@ __pstl_fill(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __last
   // lambda returning a constant.
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __is_parallel_execution_policy_v<_ExecutionPolicy> &&
-                __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
+                __libcpp_is_contiguous_iterator<_ForwardIterator>::value &&
+                is_trivially_copyable_v<remove_pointer_t<decltype(std::__unwrap_iter(__first))> > &&
+                is_trivially_copyable_v<_Tp>) {
     std::__rewrap_iter(__first, std::__omp_fill(std::__unwrap_iter(__first), __last - __first, __value));
     return __empty{};
   }
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
index db128e058823db7..609638ebb3d2505 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
@@ -15,6 +15,8 @@
 #include <__algorithm/unwrap_iter.h>
 #include <__config>
 #include <__iterator/wrap_iter.h>
+#include <__type_traits/is_trivially_copyable.h>
+#include <__type_traits/remove_pointer.h>
 #include <optional>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -46,7 +48,8 @@ __pstl_find_if(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __l
   // implementation of find_if
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __is_parallel_execution_policy_v<_ExecutionPolicy> &&
-                __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
+                __libcpp_is_contiguous_iterator<_ForwardIterator>::value &&
+                is_trivially_copyable_v<remove_pointer_t<decltype(std::__unwrap_iter(__first))> >) {
     return std::__rewrap_iter(__first, std::__omp_find_if(std::__unwrap_iter(__first), __last - __first, __pred));
     // Else we rey on the CPU PSTL backend
   } else {
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
index ef73b864773fc10..e29255a4892ee41 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
@@ -16,6 +16,8 @@
 #include <__config>
 #include <__iterator/wrap_iter.h>
 #include <__type_traits/is_execution_policy.h>
+#include <__type_traits/is_trivially_copyable.h>
+#include <__type_traits/remove_pointer.h>
 #include <__utility/empty.h>
 #include <optional>
 
@@ -44,7 +46,8 @@ __pstl_for_each(__omp_backend_tag, _ForwardIterator __first, _ForwardIterator __
   // implementation of for_each
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value &&
-                __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
+                __libcpp_is_contiguous_iterator<_ForwardIterator>::value &&
+                is_trivially_copyable_v<remove_pointer_t<decltype(std::__unwrap_iter(__first))> >) {
     std::__omp_for_each(std::__unwrap_iter(__first), __last - __first, __func);
     return __empty{};
   }
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
index f8aca11fc2102fb..3353f74ed9166da 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
@@ -16,6 +16,8 @@
 #include <__config>
 #include <__iterator/wrap_iter.h>
 #include <__type_traits/is_execution_policy.h>
+#include <__type_traits/is_trivially_copyable.h>
+#include <__type_traits/remove_pointer.h>
 #include <optional>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -84,7 +86,9 @@ _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator> __pstl_transform(
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __is_parallel_execution_policy_v<_ExecutionPolicy> &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value &&
-                __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
+                __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value &&
+                is_trivially_copyable_v<remove_pointer_t<decltype(std::__unwrap_iter(__first))> > &&
+                is_trivially_copyable_v<remove_pointer_t<decltype(std::__unwrap_iter(__result))> >) {
     std::__rewrap_iter(
         __result,
         std::__omp_transform(std::__unwrap_iter(__first), __last - __first, std::__unwrap_iter(__result), __op));
@@ -110,7 +114,10 @@ _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator> __pstl_transform(
                 __is_parallel_execution_policy_v<_ExecutionPolicy> &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator2>::value &&
-                __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
+                __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value &&
+                is_trivially_copyable_v<remove_pointer_t<decltype(std::__unwrap_iter(__first1))> > &&
+                is_trivially_copyable_v<remove_pointer_t<decltype(std::__unwrap_iter(__first2))> > &&
+                is_trivially_copyable_v<remove_pointer_t<decltype(std::__unwrap_iter(__result))> >) {
     return std::__rewrap_iter(
         __result,
         std::__omp_transform(
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
index 9e83a91d10caded..6a074fb6e4fa9be 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
@@ -18,7 +18,9 @@
 #include <__numeric/transform_reduce.h>
 #include <__type_traits/is_arithmetic.h>
 #include <__type_traits/is_execution_policy.h>
+#include <__type_traits/is_trivially_copyable.h>
 #include <__type_traits/operation_traits.h>
+#include <__type_traits/remove_pointer.h>
 #include <optional>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -144,7 +146,8 @@ _LIBCPP_HIDE_FROM_ABI optional<_Tp> __pstl_transform_reduce(
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __is_parallel_execution_policy_v<_ExecutionPolicy> &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator>::value && is_arithmetic_v<_Tp> &&
-                __is_supported_reduction<_BinaryOperation, _Tp, _Tp>::value) {
+                __is_supported_reduction<_BinaryOperation, _Tp, _Tp>::value &&
+                is_trivially_copyable_v<remove_pointer_t<decltype(std::__unwrap_iter(__first))> >) {
     return std::__omp_transform_reduce(std::__unwrap_iter(__first), __last - __first, __init, __reduce, __transform);
   }
   return std::__pstl_transform_reduce<_ExecutionPolicy>(
@@ -169,7 +172,9 @@ _LIBCPP_HIDE_FROM_ABI optional<_Tp> __pstl_transform_reduce(
                 __is_parallel_execution_policy_v<_ExecutionPolicy> &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
                 __libcpp_is_contiguous_iterator<_ForwardIterator2>::value && is_arithmetic_v<_Tp> &&
-                __is_supported_reduction<_BinaryOperation1, _Tp, _Tp>::value) {
+                __is_supported_reduction<_BinaryOperation1, _Tp, _Tp>::value &&
+                is_trivially_copyable_v<remove_pointer_t<decltype(std::__unwrap_iter(__first1))> > &&
+                is_trivially_copyable_v<remove_pointer_t<decltype(std::__unwrap_iter(__first2))> >) {
     return std::__omp_transform_reduce(
         std::__unwrap_iter(__first1), std::__unwrap_iter(__first2), __last1 - __first1, __init, __reduce, __transform);
   }
diff --git a/libcxx/test/CMakeLists.txt b/libcxx/test/CMakeLists.txt
index 48dd233462ab3b7..ae468498c9c8761 100644
--- a/libcxx/test/CMakeLists.txt
+++ b/libcxx/test/CMakeLists.txt
@@ -35,6 +35,14 @@ if (LLVM_USE_SANITIZER)
   serialize_lit_string_param(SERIALIZED_LIT_PARAMS use_sanitizer "${LLVM_USE_SANITIZER}")
 endif()
 
+# If the OpenMP PSTL backend was enabled, the OpenMP compilation toolchain must
+# also be enabled for the LIT tests
+if (DEFINED LIBCXX_PSTL_BACKEND)
+  if (LIBCXX_PSTL_BACKEND STREQUAL "openmp")
+    serialize_lit_string_param(SERIALIZED_LIT_PARAMS enable_openmp "ON")
+  endif()
+endif()
+
 serialize_lit_params_list(SERIALIZED_LIT_PARAMS LIBCXX_TEST_PARAMS)
 
 if (NOT DEFINED LIBCXX_TEST_DEPS)
diff --git a/libcxx/utils/libcxx/test/params.py b/libcxx/utils/libcxx/test/params.py
index 2ead641354e585d..5a2bdfff8a9bf5e 100644
--- a/libcxx/utils/libcxx/test/params.py
+++ b/libcxx/utils/libcxx/test/params.py
@@ -330,6 +330,14 @@ def getStdFlag(cfg, std):
         default=f"{shlex.quote(sys.executable)} {shlex.quote(str(Path(__file__).resolve().parent.parent.parent / 'run.py'))}",
         help="Custom executor to use instead of the configured default.",
         actions=lambda executor: [AddSubstitution("%{executor}", executor)],
+    ),
+    Parameter(
+        name="enable_openmp",
+        choices=[True, False],
+        type=bool,
+        default=False,
+        help="Enable the OpenMP compilation toolchain if the PSTL backend was set to OpenMP.",
+        actions=lambda enabled: [AddCompileFlag("-fopenmp")] if enabled else [],
     )
 ]
 # fmt: on
diff --git a/libcxxabi/CMakeLists.txt b/libcxxabi/CMakeLists.txt
index 6fd4f02c750f5bb..8078dffd507644d 100644
--- a/libcxxabi/CMakeLists.txt
+++ b/libcxxabi/CMakeLists.txt
@@ -414,6 +414,14 @@ if (UNIX AND ${CMAKE_SYSTEM_NAME} MATCHES "AIX")
   add_definitions("-D_XOPEN_SOURCE=700")
 endif()
 
+# If the OpenMP PSTL backend has been enabled for libcxx, OpenMP must be
+# enabled during compilation
+if (DEFINED LIBCXX_PSTL_BACKEND)
+  if (LIBCXX_PSTL_BACKEND STREQUAL "openmp")
+    add_compile_options(-fopenmp)
+  endif()
+endif()
+
 #===============================================================================
 # Setup Source Code
 #===============================================================================

>From c14840883c07ab81779e2ed8c929e1f4c933aa0a Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Fri, 27 Oct 2023 17:55:10 -0700
Subject: [PATCH 31/46] Adding tests for the OpenMP PSTL backend

---
 libcxx/test/CMakeLists.txt                    |  2 +-
 .../alg.pstl.offload/fill_offload.pass.cpp    | 41 ++++++++++++
 .../alg.pstl.offload/find_if.pass.cpp         | 67 +++++++++++++++++++
 .../alg.pstl.offload/find_if_offload.pass.cpp | 41 ++++++++++++
 .../for_each_offload.pass.cpp                 | 41 ++++++++++++
 .../for_each_overwrite_input.pass.cpp         | 64 ++++++++++++++++++
 .../openmp_version_40.verify.cpp              | 21 ++++++
 .../openmp_version_45.verify.cpp              | 21 ++++++
 .../openmp_version_51.verify.cpp              | 21 ++++++
 .../transform_offload.pass.cpp                | 63 +++++++++++++++++
 .../transform_reduce_offload.pass.cpp         | 45 +++++++++++++
 libcxx/utils/libcxx/test/params.py            | 13 +++-
 12 files changed, 437 insertions(+), 3 deletions(-)
 create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp
 create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if.pass.cpp
 create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if_offload.pass.cpp
 create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_offload.pass.cpp
 create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp
 create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_version_40.verify.cpp
 create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_version_45.verify.cpp
 create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_version_51.verify.cpp
 create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_offload.pass.cpp
 create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_offload.pass.cpp

diff --git a/libcxx/test/CMakeLists.txt b/libcxx/test/CMakeLists.txt
index ae468498c9c8761..34b5ee3f5551e2f 100644
--- a/libcxx/test/CMakeLists.txt
+++ b/libcxx/test/CMakeLists.txt
@@ -39,7 +39,7 @@ endif()
 # also be enabled for the LIT tests
 if (DEFINED LIBCXX_PSTL_BACKEND)
   if (LIBCXX_PSTL_BACKEND STREQUAL "openmp")
-    serialize_lit_string_param(SERIALIZED_LIT_PARAMS enable_openmp "ON")
+    serialize_lit_string_param(SERIALIZED_LIT_PARAMS openmp_pstl_backend "ON")
   endif()
 endif()
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp
new file mode 100644
index 000000000000000..bd143b632258476
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp
@@ -0,0 +1,41 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This test will fail if the number of devices detected by OpenMP is larger
+// than zero but for_each is not executed on the device.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp --offload-arch=native
+
+// REQUIRES: openmp_pstl_backend
+
+#include <algorithm>
+#include <cassert>
+#include <execution>
+#include <vector>
+#include <omp.h>
+
+int main(void) {
+  // We only run the test if a device is detected by OpenMP
+  if (omp_get_num_devices() < 1)
+    return 0;
+
+  // Initializing test array
+  const int __test_size = 10000;
+  std::vector<int> __v(__test_size);
+  std::for_each(std::execution::par_unseq, __v.begin(), __v.end(), [](int& n) {
+    // Returns true if executed on the host
+    n = omp_is_initial_device();
+  });
+
+  auto __idx = std::find_if(std::execution::par_unseq, __v.begin(), __v.end(), [](int& n) -> bool { return n > 0; });
+  assert(__idx == __v.end() &&
+         "omp_is_initial_device() returned true in the target region. std::for_each was not offloaded.");
+  return 0;
+}
\ No newline at end of file
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if.pass.cpp
new file mode 100644
index 000000000000000..6a61887a443d021
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if.pass.cpp
@@ -0,0 +1,67 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This test verifies that you can overwrite the input in std::for_each. If the
+// result was not copied back from the device to the host, this test would fail.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp --offload-arch=native
+
+// REQUIRES: openmp_pstl_backend
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <execution>
+#include <vector>
+
+template <class _Tp>
+void check_find_if(_Tp& __data) {
+  const int __len = __data.end() - __data.begin();
+  // Setting all elements to two except for the indexes in __idx
+  int __idx[11] = {
+      0, __len / 10, __len / 9, __len / 8, __len / 7, __len / 6, __len / 5, __len / 4, __len / 3, __len / 2, __len - 1};
+  std::fill(std::execution::par_unseq, __data.begin(), __data.end(), 2);
+  for (auto __i : __idx) {
+    __data[__i]--;
+  };
+  // Asserting that the minimas are found in the correct order
+  for (auto __i : __idx) {
+    auto __found_min = std::find_if(
+        std::execution::par_unseq, __data.begin(), __data.end(), [&](decltype(__data[0])& n) -> bool { return n < 2; });
+    assert(__found_min == (__data.begin() + __i));
+    // Incrementing the minimum, so the next one can be found
+    (*__found_min)++;
+  }
+}
+
+int main(void) {
+  const int __test_size = 10000;
+  // Testing with vector of doubles
+  {
+    std::vector<double> __v(__test_size);
+    check_find_if(__v);
+  }
+  // Testing with vector of integers
+  {
+    std::vector<int> __v(__test_size);
+    check_find_if(__v);
+  }
+  // Testing with array of doubles
+  {
+    std::array<double, __test_size> __a;
+    check_find_if(__a);
+  }
+  // Testing with array of integers
+  {
+    std::array<int, __test_size> __a;
+    check_find_if(__a);
+  }
+  return 0;
+}
\ No newline at end of file
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if_offload.pass.cpp
new file mode 100644
index 000000000000000..4e91f52673c547e
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if_offload.pass.cpp
@@ -0,0 +1,41 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This test will fail if the number of devices detected by OpenMP is larger
+// than zero but find_if is not executed on the device.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp
+
+// REQUIRES: openmp_pstl_backend
+
+#include <algorithm>
+#include <cassert>
+#include <execution>
+#include <vector>
+#include <omp.h>
+
+int main(void) {
+  // We only run the test if a device is detected by OpenMP
+  if (omp_get_num_devices() < 1)
+    return 0;
+
+  // Initializing test array
+  const int __test_size = 10000;
+  std::vector<double> __v(__test_size);
+  std::fill(std::execution::par_unseq, __v.begin(), __v.end(), 1.0);
+
+  auto __idx = std::find_if(std::execution::par_unseq, __v.begin(), __v.end(), [](double&) -> bool {
+    // Returns true if executed on the host
+    return omp_is_initial_device();
+  });
+  assert(__idx == __v.end() &&
+         "omp_is_initial_device() returned true in the target region. std::find_if was not offloaded.");
+  return 0;
+}
\ No newline at end of file
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_offload.pass.cpp
new file mode 100644
index 000000000000000..bd143b632258476
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_offload.pass.cpp
@@ -0,0 +1,41 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This test will fail if the number of devices detected by OpenMP is larger
+// than zero but for_each is not executed on the device.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp --offload-arch=native
+
+// REQUIRES: openmp_pstl_backend
+
+#include <algorithm>
+#include <cassert>
+#include <execution>
+#include <vector>
+#include <omp.h>
+
+int main(void) {
+  // We only run the test if a device is detected by OpenMP
+  if (omp_get_num_devices() < 1)
+    return 0;
+
+  // Initializing test array
+  const int __test_size = 10000;
+  std::vector<int> __v(__test_size);
+  std::for_each(std::execution::par_unseq, __v.begin(), __v.end(), [](int& n) {
+    // Returns true if executed on the host
+    n = omp_is_initial_device();
+  });
+
+  auto __idx = std::find_if(std::execution::par_unseq, __v.begin(), __v.end(), [](int& n) -> bool { return n > 0; });
+  assert(__idx == __v.end() &&
+         "omp_is_initial_device() returned true in the target region. std::for_each was not offloaded.");
+  return 0;
+}
\ No newline at end of file
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp
new file mode 100644
index 000000000000000..4f60b8dcc78838d
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp
@@ -0,0 +1,64 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This test verifies that you can overwrite the input in std::for_each. If the
+// result was not copied back from the device to the host, this test would fail.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// ADDITIONAL_COMPILE_FLAGS: -fopenmp --offload-arch=native
+
+// REQUIRES: openmp_pstl_backend
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <execution>
+#include <vector>
+
+template <class _Tp, class _Predicate, class _Up>
+void overwrite(_Tp& __data, _Predicate __pred, const _Up& __value) {
+  // This function assumes that __pred will never be the identity transformation
+  // Filling array with __value
+  std::fill(std::execution::par_unseq, __data.begin(), __data.end(), __value);
+
+  // Updating the array with a lambda
+  std::for_each(std::execution::par_unseq, __data.begin(), __data.end(), __pred);
+
+  // Asserting that no elements have the intial value
+  auto __idx = std::find_if(
+      std::execution::par_unseq, __data.begin(), __data.end(), [&, __value](decltype(__data[0])& n) -> bool {
+        return n == __value;
+      });
+  assert(__idx == __data.end());
+}
+
+int main(void) {
+  const int __test_size = 10000;
+  // Testing with vector of doubles
+  {
+    std::vector<double> __v(__test_size);
+    overwrite(__v, [&](double& __n) { __n *= __n; }, 2.0);
+  }
+  // Testing with vector of integers
+  {
+    std::vector<int> __v(__test_size);
+    overwrite(__v, [&](int& __n) { __n *= __n; }, 2);
+  }
+  // Testing with array of doubles
+  {
+    std::array<double, __test_size> __a;
+    overwrite(__a, [&](double& __n) { __n *= __n; }, 2.0);
+  }
+  // Testing with array of integers
+  {
+    std::array<int, __test_size> __a;
+    overwrite(__a, [&](int& __n) { __n *= __n; }, 2);
+  }
+  return 0;
+}
\ No newline at end of file
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_version_40.verify.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_version_40.verify.cpp
new file mode 100644
index 000000000000000..ec6d567d67226f1
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_version_40.verify.cpp
@@ -0,0 +1,21 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// OpenMP target offloading has only been supported since version 4.5. This test
+// verifies that a diagnostic error is prompted if the OpenMP version is below
+// the minimum required version.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// ADDITIONAL_COMPILE_FLAGS: -fopenmp -fopenmp-version=40
+
+// REQUIRES: openmp_pstl_backend
+
+#include <algorithm>
+
+// expected-error at __algorithm/pstl_backends/openmp/backend.h:26 {{"OpenMP target offloading has been supported since OpenMP version 4.5 (201511). Please use a more recent version of OpenMP."}}
\ No newline at end of file
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_version_45.verify.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_version_45.verify.cpp
new file mode 100644
index 000000000000000..881f0ee16e0a86a
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_version_45.verify.cpp
@@ -0,0 +1,21 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// OpenMP target offloading has only been supported since version 4.5. This test
+// verifies that one can include algorithm without any diagnostics when using
+// the minimum required version of OpenMP.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// ADDITIONAL_COMPILE_FLAGS: -fopenmp -fopenmp-version=45
+
+// REQUIRES: openmp_pstl_backend
+
+#include <algorithm>
+
+// expected-no-diagnostics
\ No newline at end of file
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_version_51.verify.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_version_51.verify.cpp
new file mode 100644
index 000000000000000..d9a0242606bd169
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_version_51.verify.cpp
@@ -0,0 +1,21 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// OpenMP target offloading has only been supported since version 4.5. This test
+// verifies that one can include algorithm without any diagnostics when using a
+// version that is newer than the minimum requirement.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// ADDITIONAL_COMPILE_FLAGS: -fopenmp -fopenmp-version=51
+
+// REQUIRES: openmp_pstl_backend
+
+#include <algorithm>
+
+// expected-no-diagnostics
\ No newline at end of file
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_offload.pass.cpp
new file mode 100644
index 000000000000000..755c1054b469e10
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_offload.pass.cpp
@@ -0,0 +1,63 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This test will fail if the number of devices detected by OpenMP is larger
+// than zero but transform is not executed on the device.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp --offload-arch=native
+
+// REQUIRES: openmp_pstl_backend
+
+#include <algorithm>
+#include <cassert>
+#include <execution>
+#include <vector>
+#include <omp.h>
+
+int main(void) {
+  // We only run the test if a device is detected by OpenMP
+  if (omp_get_num_devices() < 1)
+    return 0;
+
+  // Initializing test array
+  const int __test_size = 10000;
+  std::vector<int> __host(__test_size);
+  std::vector<int> __device(__test_size);
+  // Should execute on host
+  std::transform(std::execution::unseq, __host.begin(), __host.end(), __host.begin(), [](int& h) {
+    // Returns true if executed on the host
+    h = omp_is_initial_device();
+    return h;
+  });
+
+  // Finding first index where omp_is_initial_device() returned true
+  auto __idx = std::find_if(std::execution::par_unseq, __host.begin(), __host.end(), [](int& n) -> bool { return n; });
+  assert(__idx == __host.begin() &&
+         "omp_is_initial_device() returned false. std::transform was offloaded but shouldn't be.");
+
+  // Should execute on device
+  std::transform(
+      std::execution::par_unseq,
+      __device.begin(),
+      __device.end(),
+      __host.begin(),
+      __device.begin(),
+      [](int& d, int& h) {
+        // Should return fals
+        d = omp_is_initial_device();
+        return h == d;
+      });
+
+  // Finding first index where omp_is_initial_device() returned true
+  __idx = std::find_if(std::execution::par_unseq, __device.begin(), __device.end(), [](int& n) -> bool { return n; });
+  assert(__idx == __device.end() &&
+         "omp_is_initial_device() returned true in the target region. std::transform was not offloaded.");
+  return 0;
+}
\ No newline at end of file
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_offload.pass.cpp
new file mode 100644
index 000000000000000..072f0ffbaf9e6b6
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_offload.pass.cpp
@@ -0,0 +1,45 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This test will fail if the number of devices detected by OpenMP is larger
+// than zero but transform_reduce is not executed on the device.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp
+
+// REQUIRES: openmp_pstl_backend
+
+#include <algorithm>
+#include <cassert>
+#include <execution>
+#include <vector>
+#include <omp.h>
+#include <functional>
+
+int main(void) {
+  // We only run the test if a device is detected by OpenMP
+  if (omp_get_num_devices() < 1)
+    return 0;
+
+  // Initializing test array
+  const int __test_size = 10000;
+  std::vector<int> __v(__test_size);
+  std::vector<int> __w(__test_size);
+  std::for_each(std::execution::par_unseq, __v.begin(), __v.end(), [](int& n) { n = !omp_is_initial_device(); });
+
+  std::for_each(std::execution::par_unseq, __w.begin(), __w.end(), [](int& n) { n = !omp_is_initial_device(); });
+
+  int result = std::transform_reduce(
+      std::execution::par_unseq, __v.begin(), __v.end(), __w.begin(), (int)0, std::plus{}, [](int& n, int& m) {
+        return n + m + omp_is_initial_device();
+      });
+  assert(result == 2 * __test_size &&
+         "omp_is_initial_device() returned true in the target region. std::transform_reduce was not offloaded.");
+  return 0;
+}
\ No newline at end of file
diff --git a/libcxx/utils/libcxx/test/params.py b/libcxx/utils/libcxx/test/params.py
index 5a2bdfff8a9bf5e..df6aa5aa0b7661c 100644
--- a/libcxx/utils/libcxx/test/params.py
+++ b/libcxx/utils/libcxx/test/params.py
@@ -332,12 +332,21 @@ def getStdFlag(cfg, std):
         actions=lambda executor: [AddSubstitution("%{executor}", executor)],
     ),
     Parameter(
-        name="enable_openmp",
+        name="openmp_pstl_backend",
         choices=[True, False],
         type=bool,
         default=False,
         help="Enable the OpenMP compilation toolchain if the PSTL backend was set to OpenMP.",
-        actions=lambda enabled: [AddCompileFlag("-fopenmp")] if enabled else [],
+        actions=lambda enabled: [
+            AddCompileFlag("-fopenmp"),
+            # The linker needs to find the correct version of libomptarget
+            AddLinkFlag("-Wl,-rpath,%{lib}/../../lib"),
+            AddLinkFlag("-L%{lib}/../../lib"),
+            #  The preprocessor needs to find the omp.h header
+            AddFlag("-I %{lib}/../../runtimes/runtimes-bins/openmp/runtime/src"),
+            # If the OpenMP PSTL backend was enbaled, we wish to run the tests for it
+            AddFeature("openmp_pstl_backend")
+        ] if enabled else [],
     )
 ]
 # fmt: on

>From e4471071835c56bf04aab7f5852365eddd3b1715 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Mon, 30 Oct 2023 23:56:02 -0700
Subject: [PATCH 32/46] Updating tests to avoid issues found in code review

---
 .../alg.pstl.offload/fill_offload.pass.cpp    |  37 ++--
 .../alg.pstl.offload/find_if.pass.cpp         |  52 ++---
 .../alg.pstl.offload/find_if_offload.pass.cpp |  14 +-
 .../for_each_offload.pass.cpp                 |  16 +-
 .../for_each_overwrite_input.pass.cpp         |  45 ++--
 .../openmp_version_40.verify.cpp              |   2 +-
 .../openmp_version_45.verify.cpp              |   2 +-
 .../openmp_version_51.verify.cpp              |   2 +-
 .../transform_offload.pass.cpp                |  36 ++--
 .../transform_reduce_offload.pass.cpp         |  22 +-
 ...educe_supported_binary_operations.pass.cpp | 201 ++++++++++++++++++
 11 files changed, 319 insertions(+), 110 deletions(-)
 create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_supported_binary_operations.pass.cpp

diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp
index bd143b632258476..1557a667b81e39c 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp
@@ -7,7 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 // This test will fail if the number of devices detected by OpenMP is larger
-// than zero but for_each is not executed on the device.
+// than zero but std::for_each(std::execution::par_unseq,...) is not executed on
+// the device.
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
@@ -27,15 +28,27 @@ int main(void) {
     return 0;
 
   // Initializing test array
-  const int __test_size = 10000;
-  std::vector<int> __v(__test_size);
-  std::for_each(std::execution::par_unseq, __v.begin(), __v.end(), [](int& n) {
-    // Returns true if executed on the host
-    n = omp_is_initial_device();
-  });
-
-  auto __idx = std::find_if(std::execution::par_unseq, __v.begin(), __v.end(), [](int& n) -> bool { return n > 0; });
-  assert(__idx == __v.end() &&
-         "omp_is_initial_device() returned true in the target region. std::for_each was not offloaded.");
+  const int test_size = 10000;
+  std::vector<int> v(test_size, 2);
+
+  // By making an extra map, we can control when the data is mapped to and from
+  // the device, because the map inside std::fill will then only increment and
+  // decrement reference counters and not move data.
+  int* data = v.data();
+#pragma omp target enter data map(to : data[0 : v.size()])
+  std::fill(std::execution::par_unseq, v.begin(), v.end(), -2);
+
+  // At this point v should only contain the value 2
+  for (int vi : v)
+    assert(vi == 2 &&
+           "std::fill transferred data from device to the host but should only have decreased the reference counter.");
+
+// After moving the result back to the host it should now be -2
+#pragma omp target update from(data[0 : v.size()])
+  for (int vi : v)
+    assert(vi == -2 && "std::fill did not update the result on the device.");
+
+#pragma omp target exit data map(delete : data[0 : v.size()])
+
   return 0;
-}
\ No newline at end of file
+}
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if.pass.cpp
index 6a61887a443d021..d368701f5712414 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if.pass.cpp
@@ -6,8 +6,9 @@
 //
 //===----------------------------------------------------------------------===//
 
-// This test verifies that you can overwrite the input in std::for_each. If the
-// result was not copied back from the device to the host, this test would fail.
+// This test verifies that std::find_if(std::execution::par_unseq,...) always
+// finds the first entry in a vector matching the condition. If it was confused
+// with std::any_of, it could return the indexes in a non-increasing order.
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
@@ -22,46 +23,47 @@
 #include <vector>
 
 template <class _Tp>
-void check_find_if(_Tp& __data) {
-  const int __len = __data.end() - __data.begin();
-  // Setting all elements to two except for the indexes in __idx
-  int __idx[11] = {
-      0, __len / 10, __len / 9, __len / 8, __len / 7, __len / 6, __len / 5, __len / 4, __len / 3, __len / 2, __len - 1};
-  std::fill(std::execution::par_unseq, __data.begin(), __data.end(), 2);
-  for (auto __i : __idx) {
-    __data[__i]--;
+void check_find_if(_Tp& data) {
+  const int len = data.end() - data.begin();
+  // Decrementing the values in the test indices
+  int idx[11] = {0, len / 10, len / 9, len / 8, len / 7, len / 6, len / 5, len / 4, len / 3, len / 2, len - 1};
+  for (auto i : idx) {
+    data[i] -= 1;
   };
+
   // Asserting that the minimas are found in the correct order
-  for (auto __i : __idx) {
-    auto __found_min = std::find_if(
-        std::execution::par_unseq, __data.begin(), __data.end(), [&](decltype(__data[0])& n) -> bool { return n < 2; });
-    assert(__found_min == (__data.begin() + __i));
+  for (auto i : idx) {
+    auto found_min = std::find_if(
+        std::execution::par_unseq, data.begin(), data.end(), [&](decltype(data[0])& n) -> bool { return n < 2; });
+    assert(found_min == (data.begin() + i));
     // Incrementing the minimum, so the next one can be found
-    (*__found_min)++;
+    (*found_min) += 1;
   }
 }
 
 int main(void) {
-  const int __test_size = 10000;
+  const int test_size = 10000;
   // Testing with vector of doubles
   {
-    std::vector<double> __v(__test_size);
-    check_find_if(__v);
+    std::vector<double> v(test_size, 2.0);
+    check_find_if(v);
   }
   // Testing with vector of integers
   {
-    std::vector<int> __v(__test_size);
-    check_find_if(__v);
+    std::vector<int> v(test_size, 2);
+    check_find_if(v);
   }
   // Testing with array of doubles
   {
-    std::array<double, __test_size> __a;
-    check_find_if(__a);
+    std::array<double, test_size> a;
+    a.fill(2.0);
+    check_find_if(a);
   }
   // Testing with array of integers
   {
-    std::array<int, __test_size> __a;
-    check_find_if(__a);
+    std::array<int, test_size> a;
+    a.fill(2);
+    check_find_if(a);
   }
   return 0;
-}
\ No newline at end of file
+}
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if_offload.pass.cpp
index 4e91f52673c547e..89a884de80984ba 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if_offload.pass.cpp
@@ -7,7 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 // This test will fail if the number of devices detected by OpenMP is larger
-// than zero but find_if is not executed on the device.
+// than zero but syd::find_if(std::execution::par_unseq,...) is not executed on
+// the device.
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
@@ -27,15 +28,14 @@ int main(void) {
     return 0;
 
   // Initializing test array
-  const int __test_size = 10000;
-  std::vector<double> __v(__test_size);
-  std::fill(std::execution::par_unseq, __v.begin(), __v.end(), 1.0);
+  const int test_size = 10000;
+  std::vector<double> v(test_size, 1);
 
-  auto __idx = std::find_if(std::execution::par_unseq, __v.begin(), __v.end(), [](double&) -> bool {
+  auto idx = std::find_if(std::execution::par_unseq, v.begin(), v.end(), [](double&) -> bool {
     // Returns true if executed on the host
     return omp_is_initial_device();
   });
-  assert(__idx == __v.end() &&
+  assert(idx == v.end() &&
          "omp_is_initial_device() returned true in the target region. std::find_if was not offloaded.");
   return 0;
-}
\ No newline at end of file
+}
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_offload.pass.cpp
index bd143b632258476..6ee3310507c11db 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_offload.pass.cpp
@@ -7,7 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 // This test will fail if the number of devices detected by OpenMP is larger
-// than zero but for_each is not executed on the device.
+// than zero but for_each(std::execution::par_unseq,...) is not executed on the
+// device.
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
@@ -27,15 +28,14 @@ int main(void) {
     return 0;
 
   // Initializing test array
-  const int __test_size = 10000;
-  std::vector<int> __v(__test_size);
-  std::for_each(std::execution::par_unseq, __v.begin(), __v.end(), [](int& n) {
+  const int test_size = 10000;
+  std::vector<int> v(test_size);
+  std::for_each(std::execution::par_unseq, v.begin(), v.end(), [](int& n) {
     // Returns true if executed on the host
     n = omp_is_initial_device();
   });
 
-  auto __idx = std::find_if(std::execution::par_unseq, __v.begin(), __v.end(), [](int& n) -> bool { return n > 0; });
-  assert(__idx == __v.end() &&
-         "omp_is_initial_device() returned true in the target region. std::for_each was not offloaded.");
+  for (int vi : v)
+    assert(vi == 0 && "omp_is_initial_device() returned true in the target region. std::for_each was not offloaded.");
   return 0;
-}
\ No newline at end of file
+}
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp
index 4f60b8dcc78838d..a39aa2cf2977f67 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp
@@ -6,8 +6,9 @@
 //
 //===----------------------------------------------------------------------===//
 
-// This test verifies that you can overwrite the input in std::for_each. If the
-// result was not copied back from the device to the host, this test would fail.
+// This test verifies that you can overwrite the input in
+// std::for_each(std::execution::par_unseq,...). If the result was not copied
+// back from the device to the host, this test would fail.
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
@@ -22,43 +23,43 @@
 #include <vector>
 
 template <class _Tp, class _Predicate, class _Up>
-void overwrite(_Tp& __data, _Predicate __pred, const _Up& __value) {
-  // This function assumes that __pred will never be the identity transformation
-  // Filling array with __value
-  std::fill(std::execution::par_unseq, __data.begin(), __data.end(), __value);
+void overwrite(_Tp& data, _Predicate pred, const _Up& value) {
+  // This function assumes that pred will never be the identity transformation
 
   // Updating the array with a lambda
-  std::for_each(std::execution::par_unseq, __data.begin(), __data.end(), __pred);
+  std::for_each(std::execution::par_unseq, data.begin(), data.end(), pred);
 
   // Asserting that no elements have the intial value
-  auto __idx = std::find_if(
-      std::execution::par_unseq, __data.begin(), __data.end(), [&, __value](decltype(__data[0])& n) -> bool {
-        return n == __value;
-      });
-  assert(__idx == __data.end());
+  for (int di : data)
+    assert(
+        di != value &&
+        "The GPU implementation of std::for_each does not allow users to mutate the input as the C++ standard does.");
 }
 
 int main(void) {
-  const int __test_size = 10000;
+  const double value  = 2.0;
+  const int test_size = 10000;
   // Testing with vector of doubles
   {
-    std::vector<double> __v(__test_size);
-    overwrite(__v, [&](double& __n) { __n *= __n; }, 2.0);
+    std::vector<double> v(test_size, value);
+    overwrite(v, [&](double& n) { n *= n; }, value);
   }
   // Testing with vector of integers
   {
-    std::vector<int> __v(__test_size);
-    overwrite(__v, [&](int& __n) { __n *= __n; }, 2);
+    std::vector<int> v(test_size, (int)value);
+    overwrite(v, [&](int& n) { n *= n; }, (int)value);
   }
   // Testing with array of doubles
   {
-    std::array<double, __test_size> __a;
-    overwrite(__a, [&](double& __n) { __n *= __n; }, 2.0);
+    std::array<double, test_size> a;
+    a.fill(value);
+    overwrite(a, [&](double& n) { n *= n; }, value);
   }
   // Testing with array of integers
   {
-    std::array<int, __test_size> __a;
-    overwrite(__a, [&](int& __n) { __n *= __n; }, 2);
+    std::array<int, test_size> a;
+    a.fill((int)value);
+    overwrite(a, [&](int& n) { n *= n; }, (int)value);
   }
   return 0;
-}
\ No newline at end of file
+}
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_version_40.verify.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_version_40.verify.cpp
index ec6d567d67226f1..77836d47b081209 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_version_40.verify.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_version_40.verify.cpp
@@ -18,4 +18,4 @@
 
 #include <algorithm>
 
-// expected-error at __algorithm/pstl_backends/openmp/backend.h:26 {{"OpenMP target offloading has been supported since OpenMP version 4.5 (201511). Please use a more recent version of OpenMP."}}
\ No newline at end of file
+// expected-error at __algorithm/pstl_backends/openmp/backend.h:26 {{"OpenMP target offloading has been supported since OpenMP version 4.5 (201511). Please use a more recent version of OpenMP."}}
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_version_45.verify.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_version_45.verify.cpp
index 881f0ee16e0a86a..ff0bf1cf67c8e81 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_version_45.verify.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_version_45.verify.cpp
@@ -18,4 +18,4 @@
 
 #include <algorithm>
 
-// expected-no-diagnostics
\ No newline at end of file
+// expected-no-diagnostics
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_version_51.verify.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_version_51.verify.cpp
index d9a0242606bd169..401285586caee2c 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_version_51.verify.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/openmp_version_51.verify.cpp
@@ -18,4 +18,4 @@
 
 #include <algorithm>
 
-// expected-no-diagnostics
\ No newline at end of file
+// expected-no-diagnostics
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_offload.pass.cpp
index 755c1054b469e10..d813a8828fc52ee 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_offload.pass.cpp
@@ -7,7 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 // This test will fail if the number of devices detected by OpenMP is larger
-// than zero but transform is not executed on the device.
+// than zero but std::transform(std::execution::par_unseq,...) is not executed
+// on the device.
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
@@ -26,38 +27,31 @@ int main(void) {
   if (omp_get_num_devices() < 1)
     return 0;
 
-  // Initializing test array
-  const int __test_size = 10000;
-  std::vector<int> __host(__test_size);
-  std::vector<int> __device(__test_size);
+  // Initializing test arrays
+  const int test_size = 10000;
+  std::vector<int> host(test_size);
+  std::vector<int> device(test_size);
   // Should execute on host
-  std::transform(std::execution::unseq, __host.begin(), __host.end(), __host.begin(), [](int& h) {
+  std::transform(std::execution::unseq, host.begin(), host.end(), host.begin(), [](int& h) {
     // Returns true if executed on the host
     h = omp_is_initial_device();
     return h;
   });
 
-  // Finding first index where omp_is_initial_device() returned true
-  auto __idx = std::find_if(std::execution::par_unseq, __host.begin(), __host.end(), [](int& n) -> bool { return n; });
-  assert(__idx == __host.begin() &&
-         "omp_is_initial_device() returned false. std::transform was offloaded but shouldn't be.");
+  // Asserting the std::transform(std::execution::unseq,...) executed on the host
+  for (int hi : host)
+    assert(hi && "omp_is_initial_device() returned false. std::transform was offloaded but shouldn't be.");
 
   // Should execute on device
   std::transform(
-      std::execution::par_unseq,
-      __device.begin(),
-      __device.end(),
-      __host.begin(),
-      __device.begin(),
-      [](int& d, int& h) {
+      std::execution::par_unseq, device.begin(), device.end(), host.begin(), device.begin(), [](int& d, int& h) {
         // Should return fals
         d = omp_is_initial_device();
         return h == d;
       });
 
-  // Finding first index where omp_is_initial_device() returned true
-  __idx = std::find_if(std::execution::par_unseq, __device.begin(), __device.end(), [](int& n) -> bool { return n; });
-  assert(__idx == __device.end() &&
-         "omp_is_initial_device() returned true in the target region. std::transform was not offloaded.");
+  // Asserting the std::transform(std::execution::par_unseq,...) executed on the device
+  for (int di : device)
+    assert(!di && "omp_is_initial_device() returned true in the target region. std::transform was not offloaded.");
   return 0;
-}
\ No newline at end of file
+}
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_offload.pass.cpp
index 072f0ffbaf9e6b6..20acd2be45f85ed 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_offload.pass.cpp
@@ -7,7 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 // This test will fail if the number of devices detected by OpenMP is larger
-// than zero but transform_reduce is not executed on the device.
+// than zero but std::transform_reduce(std::execution::par_unseq,...) is not
+// executed on the device.
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
@@ -18,9 +19,9 @@
 #include <algorithm>
 #include <cassert>
 #include <execution>
+#include <functional>
 #include <vector>
 #include <omp.h>
-#include <functional>
 
 int main(void) {
   // We only run the test if a device is detected by OpenMP
@@ -28,18 +29,15 @@ int main(void) {
     return 0;
 
   // Initializing test array
-  const int __test_size = 10000;
-  std::vector<int> __v(__test_size);
-  std::vector<int> __w(__test_size);
-  std::for_each(std::execution::par_unseq, __v.begin(), __v.end(), [](int& n) { n = !omp_is_initial_device(); });
-
-  std::for_each(std::execution::par_unseq, __w.begin(), __w.end(), [](int& n) { n = !omp_is_initial_device(); });
+  const int test_size = 10000;
+  std::vector<int> v(test_size, 1);
+  std::vector<int> w(test_size, 1);
 
   int result = std::transform_reduce(
-      std::execution::par_unseq, __v.begin(), __v.end(), __w.begin(), (int)0, std::plus{}, [](int& n, int& m) {
-        return n + m + omp_is_initial_device();
+      std::execution::par_unseq, v.begin(), v.end(), w.begin(), (int)0, std::plus{}, [](int& n, int& m) {
+        return n + m + omp_is_initial_device(); // Gives 2 if executed on device, 3 if executed on host
       });
-  assert(result == 2 * __test_size &&
+  assert(result == 2 * test_size &&
          "omp_is_initial_device() returned true in the target region. std::transform_reduce was not offloaded.");
   return 0;
-}
\ No newline at end of file
+}
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_supported_binary_operations.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_supported_binary_operations.pass.cpp
new file mode 100644
index 000000000000000..77046f1e52f64fc
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_supported_binary_operations.pass.cpp
@@ -0,0 +1,201 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This test verifies that std::transform_reduce(std::execution::par_unseq,...)
+// can be offloaded for a number of supported binary operations. The following
+// binary operations should be supported for the reducer:
+// - std::plus
+// - std::minus
+// - std::multiplies
+// - std::logical_and
+// - std::logical_or
+// - std::bit_and
+// - std::bit_or
+// - std::bit_xor
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp
+
+// REQUIRES: openmp_pstl_backend
+
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <execution>
+#include <functional>
+#include <vector>
+#include <omp.h>
+#include <iostream>
+
+int main(void) {
+  // We only run the test if a device is detected by OpenMP
+  if (omp_get_num_devices() < 1)
+    return 0;
+
+  // Initializing test array
+  const int test_size = 10000;
+
+  //===--------------------------------------------------------------------===//
+  // Arithmetic binary operators
+  //===--------------------------------------------------------------------===//
+
+  // Addition with doubles
+  {
+    std::vector<double> v(test_size, 1.0);
+    std::vector<double> w(test_size, 2.0);
+    double result = std::transform_reduce(
+        std::execution::par_unseq, v.begin(), v.end(), w.begin(), 5.0, std::plus{}, [](double& a, double& b) {
+          return 0.5 * (b - a) * ((double)!omp_is_initial_device());
+        });
+    assert((std::abs(result - 0.5 * ((double)test_size) - 5.0) < 1e-8) &&
+           "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for the binary "
+           "operation std::plus.");
+  }
+
+  // Subtraction of floats
+  {
+    std::vector<float> v(test_size, 1.0f);
+    std::vector<float> w(test_size, 1.5f);
+    float result = std::transform_reduce(
+        std::execution::par_unseq,
+        v.begin(),
+        v.end(),
+        w.begin(),
+        1.25 * ((float)test_size),
+        std::minus{},
+        [](float& a, float& b) { return 0.5 * (a + b) * ((float)!omp_is_initial_device()); });
+    assert((std::abs(result) < 1e-8f) &&
+           "std::transform_reduce(std::execution::par_unseq,...) does not have the "
+           "intended effect for the binary operation std::minus.");
+  }
+
+  // Multiplication of doubles
+  {
+    std::vector<double> v(test_size, 1.0);
+    std::vector<double> w(test_size, 0.0001);
+    double result = std::transform_reduce(
+        std::execution::par_unseq, v.begin(), v.end(), w.begin(), -1.0, std::multiplies{}, [](double& a, double& b) {
+          return (a + b) * ((double)!omp_is_initial_device());
+        });
+    assert((std::abs(result + pow(1.0001, test_size)) < 1e-8) &&
+           "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for the binary "
+           "operation std::multiplies.");
+  }
+
+  //===--------------------------------------------------------------------===//
+  // Logical binary operators
+  //===--------------------------------------------------------------------===//
+
+  // Logical and
+  {
+    std::vector<int> v(test_size, 1);
+    // The result should be true with an initial value of 1
+    int result =
+        std::transform_reduce(std::execution::par_unseq, v.begin(), v.end(), 1, std::logical_and{}, [](int& a) {
+          return a && !omp_is_initial_device();
+        });
+    assert(result &&
+           "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for the binary "
+           "operation std::logical_and.");
+
+    // And false by an initial value of 0
+    result = std::transform_reduce(std::execution::par_unseq, v.begin(), v.end(), 0, std::logical_and{}, [](int& a) {
+      return a && !omp_is_initial_device();
+    });
+    assert(!result &&
+           "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for the binary "
+           "operation std::logical_and.");
+  }
+
+  // Logical or
+  {
+    std::vector<int> v(test_size, 0);
+    // The result should be true with an initial value of 1
+    int result = std::transform_reduce(std::execution::par_unseq, v.begin(), v.end(), 1, std::logical_or{}, [](int& a) {
+      return a && !omp_is_initial_device();
+    });
+    assert(result &&
+           "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for the binary "
+           "operation std::logical_or.");
+
+    // And false by an initial value of 0
+    result = std::transform_reduce(std::execution::par_unseq, v.begin(), v.end(), 0, std::logical_or{}, [](int& a) {
+      return a && !omp_is_initial_device();
+    });
+    assert(!result && "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for the "
+                      "binary operation std::logical_or.");
+  }
+
+  //===--------------------------------------------------------------------===//
+  // Birwise binary operators
+  //===--------------------------------------------------------------------===//
+
+  // Bitwise and
+  {
+    std::vector<unsigned int> v(test_size, 3);
+    std::vector<unsigned int> w(test_size, 2);
+    // For odd numbers the result should be true
+    int result =
+        std::transform_reduce(std::execution::par_unseq, v.begin(), v.end(), 0x1, std::bit_and{}, [](unsigned int& a) {
+          return a + omp_is_initial_device();
+        });
+    assert(result && "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for the "
+                     "binary operation std::bit_and.");
+
+    // For even numbers the result should be false
+    result =
+        std::transform_reduce(std::execution::par_unseq, w.begin(), w.end(), 0x1, std::bit_and{}, [](unsigned int& a) {
+          return a + omp_is_initial_device();
+        });
+    assert(!result && "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for the "
+                      "binary operation std::bit_and.");
+  }
+
+  // Bitwise or
+  {
+    std::vector<unsigned int> v(test_size, 0);
+    int result =
+        std::transform_reduce(std::execution::par_unseq, v.begin(), v.end(), 0, std::bit_or{}, [](unsigned int& a) {
+          return a || omp_is_initial_device();
+        });
+    assert(!result && "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for the "
+                      "binary operation std::bit_or.");
+
+    // After adding a one, the result should be true
+    v[v.size() / 2] = 1;
+    result =
+        std::transform_reduce(std::execution::par_unseq, v.begin(), v.end(), 0, std::bit_or{}, [](unsigned int& a) {
+          return a && !omp_is_initial_device();
+        });
+    assert(result && "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for the "
+                     "binary operation std::bit_or.");
+  }
+
+  // Bitwise xor
+  {
+    std::vector<unsigned int> v(test_size, 0xef);
+    int result =
+        std::transform_reduce(std::execution::par_unseq, v.begin(), v.end(), 0, std::bit_xor{}, [](unsigned int& a) {
+          return a << omp_is_initial_device();
+        });
+    assert(result == 0 && "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for "
+                          "the binary operation std::bit_or.");
+
+    // After adding a one, the result should be true
+    v[v.size() / 2] = 0xea;
+    result =
+        std::transform_reduce(std::execution::par_unseq, v.begin(), v.end(), 0, std::bit_xor{}, [](unsigned int& a) {
+          return a << omp_is_initial_device();
+        });
+    assert(result == 5 && "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for "
+                          "the binary operation std::bit_or.");
+  }
+
+  return 0;
+}

>From d3da1ff49911c1146c2df0aa1839699fbb8afdc0 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Wed, 1 Nov 2023 10:05:03 -0700
Subject: [PATCH 33/46] Adding -fno-exceptions to tests to conform with
 https://github.com/llvm/llvm-project/pull/69669

---
 .../libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp  | 2 +-
 .../test/libcxx/algorithms/alg.pstl.offload/find_if.pass.cpp  | 2 +-
 .../algorithms/alg.pstl.offload/find_if_offload.pass.cpp      | 2 +-
 .../algorithms/alg.pstl.offload/for_each_offload.pass.cpp     | 2 +-
 .../alg.pstl.offload/for_each_overwrite_input.pass.cpp        | 2 +-
 .../algorithms/alg.pstl.offload/transform_offload.pass.cpp    | 2 +-
 .../alg.pstl.offload/transform_reduce_offload.pass.cpp        | 2 +-
 .../transform_reduce_supported_binary_operations.pass.cpp     | 2 +-
 libcxx/utils/libcxx/test/params.py                            | 4 ++++
 9 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp
index 1557a667b81e39c..7dc5983d8f8e995 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp
@@ -12,7 +12,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp --offload-arch=native
+// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp -fno-exceptions
 
 // REQUIRES: openmp_pstl_backend
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if.pass.cpp
index d368701f5712414..791e748a30c71c9 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if.pass.cpp
@@ -12,7 +12,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp --offload-arch=native
+// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp -fno-exceptions
 
 // REQUIRES: openmp_pstl_backend
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if_offload.pass.cpp
index 89a884de80984ba..e26c33e97f77bd4 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if_offload.pass.cpp
@@ -12,7 +12,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp
+// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp -fno-exceptions
 
 // REQUIRES: openmp_pstl_backend
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_offload.pass.cpp
index 6ee3310507c11db..7a78e4282a8cb0c 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_offload.pass.cpp
@@ -12,7 +12,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp --offload-arch=native
+// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp -fno-exceptions
 
 // REQUIRES: openmp_pstl_backend
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp
index a39aa2cf2977f67..bfd2e2819d953e4 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp
@@ -12,7 +12,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -fopenmp --offload-arch=native
+// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp -fno-exceptions
 
 // REQUIRES: openmp_pstl_backend
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_offload.pass.cpp
index d813a8828fc52ee..3f8c91b366ca4ca 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_offload.pass.cpp
@@ -12,7 +12,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp --offload-arch=native
+// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp -fno-exceptions
 
 // REQUIRES: openmp_pstl_backend
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_offload.pass.cpp
index 20acd2be45f85ed..229956215c6e120 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_offload.pass.cpp
@@ -12,7 +12,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp
+// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp -fno-exceptions
 
 // REQUIRES: openmp_pstl_backend
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_supported_binary_operations.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_supported_binary_operations.pass.cpp
index 77046f1e52f64fc..4f8974625f56349 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_supported_binary_operations.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_supported_binary_operations.pass.cpp
@@ -20,7 +20,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp
+// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp -fno-exceptions
 
 // REQUIRES: openmp_pstl_backend
 
diff --git a/libcxx/utils/libcxx/test/params.py b/libcxx/utils/libcxx/test/params.py
index df6aa5aa0b7661c..b3b64215ef2413d 100644
--- a/libcxx/utils/libcxx/test/params.py
+++ b/libcxx/utils/libcxx/test/params.py
@@ -343,6 +343,10 @@ def getStdFlag(cfg, std):
             AddLinkFlag("-Wl,-rpath,%{lib}/../../lib"),
             AddLinkFlag("-L%{lib}/../../lib"),
             #  The preprocessor needs to find the omp.h header
+            # If OpenMP was installed as a project, the header lives in the
+            # following directory
+            AddFlag("-I %{lib}/../../projects/openmp/runtime/src/"),
+            # And if it was installed as a runtime it lives in
             AddFlag("-I %{lib}/../../runtimes/runtimes-bins/openmp/runtime/src"),
             # If the OpenMP PSTL backend was enbaled, we wish to run the tests for it
             AddFeature("openmp_pstl_backend")

>From d2f51b12b7835f87e2ae65847cff182995ed56d8 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Wed, 1 Nov 2023 16:46:17 -0700
Subject: [PATCH 34/46] Passing on environment variables to GPU offloading
 tests

---
 libcxx/docs/UsingLibcxx.rst                   |  9 ++--
 .../__algorithm/pstl_backends/openmp/fill.h   |  2 +-
 .../pstl_backends/openmp/find_if.h            |  2 +-
 .../pstl_backends/openmp/for_each.h           |  2 +-
 .../pstl_backends/openmp/transform.h          |  4 +-
 .../pstl_backends/openmp/transform_reduce.h   |  4 +-
 libcxx/test/configs/llvm-libc++-shared.cfg.in |  2 +-
 .../alg.pstl.offload/fill_offload.pass.cpp    |  2 +-
 .../alg.pstl.offload/find_if.pass.cpp         |  2 +-
 .../alg.pstl.offload/find_if_offload.pass.cpp |  2 +-
 .../for_each_offload.pass.cpp                 |  2 +-
 .../for_each_overwrite_input.pass.cpp         |  2 +-
 .../gpu_environemt_variables.pass.cpp         | 51 +++++++++++++++++++
 .../transform_offload.pass.cpp                |  2 +-
 .../transform_reduce_offload.pass.cpp         |  2 +-
 ...educe_supported_binary_operations.pass.cpp | 10 ++--
 libcxx/utils/libcxx/test/params.py            |  2 +-
 libcxx/utils/run.py                           | 15 ++++++
 18 files changed, 90 insertions(+), 27 deletions(-)
 create mode 100644 libcxx/test/libcxx/algorithms/alg.pstl.offload/gpu_environemt_variables.pass.cpp

diff --git a/libcxx/docs/UsingLibcxx.rst b/libcxx/docs/UsingLibcxx.rst
index c9726ef4f604018..ce8fb04a8ec1562 100644
--- a/libcxx/docs/UsingLibcxx.rst
+++ b/libcxx/docs/UsingLibcxx.rst
@@ -544,20 +544,17 @@ pointer can be obtained with `target map(from:<list of identifiers>)`.
 
   int main()
   {
-    int * a =  new int[LEN];
-    // Initialize the array to 2 on the device
-    std::fill(std::execution::par_unseq,a, a+LEN,2);
+    std::vector<int> a(LEN,2);
     // Get the device pointer for cube
     void (*dcube)(int& n);
     #pragma omp target map(from:dcube)
     dcube = &cube;
     // Pass the device function pointer to the parallel algorithm
-    std::for_each(std::execution::par_unseq,a, a+LEN,dcube);
+    std::for_each(std::execution::par_unseq,a.begin(), a.end(),dcube);
     // Validate that the result is 8 on the host for all array indices
-    std::for_each(std::execution::par,a, a+LEN,[&](int & n){
+    std::for_each(std::execution::par,a.begin(), a.end(),[&](int & n){
       assert(n == 8);
     });
-    delete[] a;
     return 0;
   }
 
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/fill.h b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
index 7a9c379491a5f21..6d2ba97df3c8096 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/fill.h
@@ -32,7 +32,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 template <class _Tp, class _DifferenceType, class _Up>
 _LIBCPP_HIDE_FROM_ABI _Tp* __omp_fill(_Tp* __out1, _DifferenceType __n, const _Up& __value) noexcept {
   __par_backend::__omp_map_alloc(__out1, __n);
-#  pragma omp target teams distribute parallel for simd
+#  pragma omp target teams distribute parallel for
   for (_DifferenceType __i = 0; __i < __n; ++__i)
     *(__out1 + __i) = __value;
   __par_backend::__omp_map_from(__out1, __n);
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
index 609638ebb3d2505..bf569a4e61520ac 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/find_if.h
@@ -31,7 +31,7 @@ template <class _Tp, class _DifferenceType, class _Predicate>
 _LIBCPP_HIDE_FROM_ABI _Tp* __omp_find_if(_Tp* __first, _DifferenceType __n, _Predicate __pred) noexcept {
   __par_backend::__omp_map_to(__first, __n);
   _DifferenceType __idx = __n;
-#  pragma omp target teams distribute parallel for simd reduction(min : __idx)
+#  pragma omp target teams distribute parallel for reduction(min : __idx)
   for (_DifferenceType __i = 0; __i < __n; ++__i) {
     if (__pred(*(__first + __i))) {
       __idx = (__i < __idx) ? __i : __idx;
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
index e29255a4892ee41..9e5d49009b0e7ac 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/for_each.h
@@ -32,7 +32,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 template <class _Tp, class _DifferenceType, class _Function>
 _LIBCPP_HIDE_FROM_ABI _Tp* __omp_for_each(_Tp* __inout1, _DifferenceType __n, _Function __f) noexcept {
   __par_backend::__omp_map_to(__inout1, __n);
-#  pragma omp target teams distribute parallel for simd
+#  pragma omp target teams distribute parallel for
   for (_DifferenceType __i = 0; __i < __n; ++__i)
     __f(*(__inout1 + __i));
   __par_backend::__omp_map_from(__inout1, __n);
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
index 3353f74ed9166da..c3bfc4228a22522 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform.h
@@ -40,7 +40,7 @@ _LIBCPP_HIDE_FROM_ABI _Tp* __omp_transform(_Tp* __in1, _DifferenceType __n, _Up*
   // allocate the buffer on the device without copying the data.
   __par_backend::__omp_map_to(__in1, __n);
   __par_backend::__omp_map_alloc(__out1, __n);
-#  pragma omp target teams distribute parallel for simd
+#  pragma omp target teams distribute parallel for
   for (_DifferenceType __i = 0; __i < __n; ++__i)
     *(__out1 + __i) = __f(*(__in1 + __i));
   // The order of the following two maps matters, since the user could legally
@@ -62,7 +62,7 @@ __omp_transform(_Tp* __in1, _DifferenceType __n, _Up* __in2, _Vp* __out1, _Funct
   __par_backend::__omp_map_to(__in1, __n);
   __par_backend::__omp_map_to(__in2, __n);
   __par_backend::__omp_map_alloc(__out1, __n);
-#  pragma omp target teams distribute parallel for simd
+#  pragma omp target teams distribute parallel for
   for (_DifferenceType __i = 0; __i < __n; ++__i)
     *(__out1 + __i) = __f(*(__in1 + __i), *(__in2 + __i));
   // The order of the following three maps matters, since the user could legally
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
index 6a074fb6e4fa9be..de617423899a071 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/transform_reduce.h
@@ -48,7 +48,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
         std_op<_BinaryOperationType> __reduce,                                                                         \
         _UnaryOperation __transform) noexcept {                                                                        \
       __par_backend::__omp_map_to(__first, __n);                                                                       \
-_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init))                                   \
+_PSTL_PRAGMA(omp target teams distribute parallel for reduction(omp_op:__init))                                        \
       for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                  \
         __init = __reduce(__init, __transform(*(__first + __i)));                                                      \
       __par_backend::__omp_map_release(__first, __n);                                                                  \
@@ -71,7 +71,7 @@ _PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__in
         _UnaryOperation __transform) noexcept {                                                                        \
       __par_backend::__omp_map_to(__first1, __n);                                                                      \
       __par_backend::__omp_map_to(__first2, __n);                                                                      \
-_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init))                                   \
+_PSTL_PRAGMA(omp target teams distribute parallel for reduction(omp_op:__init))                                        \
       for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                  \
         __init = __reduce(__init, __transform(*(__first1 + __i), *(__first2 + __i)));                                  \
       __par_backend::__omp_map_release(__first1, __n);                                                                 \
diff --git a/libcxx/test/configs/llvm-libc++-shared.cfg.in b/libcxx/test/configs/llvm-libc++-shared.cfg.in
index 143b3b3feae1109..8ffe69f6271510f 100644
--- a/libcxx/test/configs/llvm-libc++-shared.cfg.in
+++ b/libcxx/test/configs/llvm-libc++-shared.cfg.in
@@ -24,4 +24,4 @@ libcxx.test.config.configure(
     libcxx.test.features.DEFAULT_FEATURES,
     config,
     lit_config
-)
+)
\ No newline at end of file
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp
index 7dc5983d8f8e995..a97cb42845eb6f3 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp
@@ -12,7 +12,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp -fno-exceptions
+// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp -fno-exceptions --offload-arch=native -L%{lib}/../../lib -lomptarget -L%{lib}/../../projects/openmp/libomptarget/ -lomptarget.devicertl
 
 // REQUIRES: openmp_pstl_backend
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if.pass.cpp
index 791e748a30c71c9..38544d76b86ad71 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if.pass.cpp
@@ -12,7 +12,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp -fno-exceptions
+// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp -fno-exceptions --offload-arch=native -L%{lib}/../../lib -lomptarget -L%{lib}/../../projects/openmp/libomptarget/ -lomptarget.devicertl
 
 // REQUIRES: openmp_pstl_backend
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if_offload.pass.cpp
index e26c33e97f77bd4..82425be01c181fa 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if_offload.pass.cpp
@@ -12,7 +12,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp -fno-exceptions
+// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp -fno-exceptions --offload-arch=native -L%{lib}/../../lib -lomptarget  -L%{lib}/../../projects/openmp/libomptarget/ -lomptarget.devicertl
 
 // REQUIRES: openmp_pstl_backend
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_offload.pass.cpp
index 7a78e4282a8cb0c..e2a2797e5515dc7 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_offload.pass.cpp
@@ -12,7 +12,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp -fno-exceptions
+// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp -fno-exceptions --offload-arch=native -L%{lib}/../../lib -lomptarget -L%{lib}/../../projects/openmp/libomptarget/ -lomptarget.devicertl
 
 // REQUIRES: openmp_pstl_backend
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp
index bfd2e2819d953e4..8729b29ac072139 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp
@@ -12,7 +12,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp -fno-exceptions
+// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp -fno-exceptions --offload-arch=native -L%{lib}/../../lib -lomptarget -L%{lib}/../../projects/openmp/libomptarget/ -lomptarget.devicertl
 
 // REQUIRES: openmp_pstl_backend
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/gpu_environemt_variables.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/gpu_environemt_variables.pass.cpp
new file mode 100644
index 000000000000000..eb9265556cb0daf
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/gpu_environemt_variables.pass.cpp
@@ -0,0 +1,51 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This test verifies that the libc++ test configuration forwards the AMD and
+// NVIDIA environment variables specifying the visible devices. Intially when
+// developing the OpenMP offloading tests, this was not the case, and this test
+// will reveal if the configuration is wrong another time.
+
+// UNSUPPORTED: c++03, c++11, c++14, gcc
+
+// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp -fno-exceptions --offload-arch=native -L%{lib}/../../lib -lomptarget  -L%{lib}/../../projects/openmp/libomptarget/ -lomptarget.devicertl
+
+// REQUIRES: openmp_pstl_backend
+
+#include <string>
+#include <cassert>
+#include <omp.h>
+#include <iostream>
+
+std::string get_env_var(std::string const& env_var_name, int& flag) {
+  char* val;
+  val                = getenv(env_var_name.c_str());
+  std::string retval = "";
+  flag               = (val != NULL);
+  return (val != NULL) ? val : "";
+}
+
+int main(void) {
+  // Stores whether the environment variable was found
+  int status = 0;
+
+  // Checking for AMD's enviroment variable for specifying visible devices
+  std::string rocr_visible_devices = get_env_var("ROCR_VISIBLE_DEVICES", status);
+  if (status)
+    assert(
+        (rocr_visible_devices.empty() || (omp_get_num_devices() > 0)) &&
+        "ROCR_VISIBLE_DEVICES was set but no devices were detected by OpenMP. The libc++ test suite is misconfigured.");
+
+  // Checking for NVIDIA's enviroment variable for specifying visible devices
+  std::string cuda_visible_devices = get_env_var("CUDA_VISIBLE_DEVICES", status);
+  if (status)
+    assert(
+        (cuda_visible_devices.empty() || (omp_get_num_devices() > 0)) &&
+        "CUDA_VISIBLE_DEVICES was set but no devices were detected by OpenMP. The libc++ test suite is misconfigured.");
+  return 0;
+}
\ No newline at end of file
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_offload.pass.cpp
index 3f8c91b366ca4ca..1be15fd15454ffb 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_offload.pass.cpp
@@ -12,7 +12,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp -fno-exceptions
+// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp -fno-exceptions --offload-arch=native -L%{lib}/../../lib -lomptarget -L%{lib}/../../projects/openmp/libomptarget/ -lomptarget.devicertl
 
 // REQUIRES: openmp_pstl_backend
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_offload.pass.cpp
index 229956215c6e120..30d45060c4ec28b 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_offload.pass.cpp
@@ -12,7 +12,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp -fno-exceptions
+// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp -fno-exceptions --offload-arch=native -L%{lib}/../../lib -lomptarget -L%{lib}/../../projects/openmp/libomptarget/ -lomptarget.devicertl
 
 // REQUIRES: openmp_pstl_backend
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_supported_binary_operations.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_supported_binary_operations.pass.cpp
index 4f8974625f56349..593a63e57c806b3 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_supported_binary_operations.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_supported_binary_operations.pass.cpp
@@ -20,7 +20,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, gcc
 
-// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp -fno-exceptions
+// ADDITIONAL_COMPILE_FLAGS: -O2 -Wno-pass-failed -fopenmp -fno-exceptions --offload-arch=native -L%{lib}/../../lib -lomptarget -L%{lib}/../../projects/openmp/libomptarget/ -lomptarget.devicertl
 
 // REQUIRES: openmp_pstl_backend
 
@@ -160,8 +160,8 @@ int main(void) {
   // Bitwise or
   {
     std::vector<unsigned int> v(test_size, 0);
-    int result =
-        std::transform_reduce(std::execution::par_unseq, v.begin(), v.end(), 0, std::bit_or{}, [](unsigned int& a) {
+    int result = std::transform_reduce(
+        std::execution::par_unseq, v.begin(), v.end(), 0, std::bit_or{}, [](unsigned int& a) -> unsigned int {
           return a || omp_is_initial_device();
         });
     assert(!result && "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for the "
@@ -169,8 +169,8 @@ int main(void) {
 
     // After adding a one, the result should be true
     v[v.size() / 2] = 1;
-    result =
-        std::transform_reduce(std::execution::par_unseq, v.begin(), v.end(), 0, std::bit_or{}, [](unsigned int& a) {
+    result          = std::transform_reduce(
+        std::execution::par_unseq, v.begin(), v.end(), 0, std::bit_or{}, [](unsigned int& a) -> unsigned int {
           return a && !omp_is_initial_device();
         });
     assert(result && "std::transform_reduce(std::execution::par_unseq,...) does not have the intended effect for the "
diff --git a/libcxx/utils/libcxx/test/params.py b/libcxx/utils/libcxx/test/params.py
index b3b64215ef2413d..5f99533d5b4bebc 100644
--- a/libcxx/utils/libcxx/test/params.py
+++ b/libcxx/utils/libcxx/test/params.py
@@ -341,7 +341,7 @@ def getStdFlag(cfg, std):
             AddCompileFlag("-fopenmp"),
             # The linker needs to find the correct version of libomptarget
             AddLinkFlag("-Wl,-rpath,%{lib}/../../lib"),
-            AddLinkFlag("-L%{lib}/../../lib"),
+            #AddLinkFlag("-L%{lib}/../../lib -lomptarget"),
             #  The preprocessor needs to find the omp.h header
             # If OpenMP was installed as a project, the header lives in the
             # following directory
diff --git a/libcxx/utils/run.py b/libcxx/utils/run.py
index 6b4d615444bcfa5..5baffadb35ec06b 100755
--- a/libcxx/utils/run.py
+++ b/libcxx/utils/run.py
@@ -63,6 +63,21 @@ def main():
         # TEMP is needed for placing temp files in a sensible directory.
         if "TEMP" in os.environ:
             env["TEMP"] = os.environ.get("TEMP")
+    
+    # Forwarding the environment variable CUDA_VISIBLE_DEVICES which configures
+    # the visible NVIDIA GPUs.
+    if 'CUDA_VISIBLE_DEVICES' in os.environ:
+        env['CUDA_VISIBLE_DEVICES'] = os.environ['CUDA_VISIBLE_DEVICES']
+
+    # Forwarding the environment variable ROCR_VISIBLE_DEVICES which configures
+    # the visible AMD GPUs.
+    if 'ROCR_VISIBLE_DEVICES' in os.environ:
+        env['ROCR_VISIBLE_DEVICES'] = os.environ['ROCR_VISIBLE_DEVICES']
+
+    # Pass the OpenMP debug flag. Can be used to print information about the
+    # GPU execution of the tests.
+    if 'LIBOMPTARGET_DEBUG' in os.environ:
+        env['LIBOMPTARGET_DEBUG'] = os.environ['LIBOMPTARGET_DEBUG']
 
     # Run the command line with the given environment in the execution directory.
     return subprocess.call(commandLine, cwd=args.execdir, env=env, shell=False)

>From 4c00093768a19ab70321b79a12f22b0ae048bb5b Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Thu, 2 Nov 2023 10:11:15 -0700
Subject: [PATCH 35/46] Clang-formatted for_each_overwrite_input.pass.cpp

---
 .../for_each_overwrite_input.pass.cpp                | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp
index 8729b29ac072139..5c984fca88a4681 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp
@@ -42,24 +42,28 @@ int main(void) {
   // Testing with vector of doubles
   {
     std::vector<double> v(test_size, value);
-    overwrite(v, [&](double& n) { n *= n; }, value);
+    overwrite(
+        v, [&](double& n) { n *= n; }, value);
   }
   // Testing with vector of integers
   {
     std::vector<int> v(test_size, (int)value);
-    overwrite(v, [&](int& n) { n *= n; }, (int)value);
+    overwrite(
+        v, [&](int& n) { n *= n; }, (int)value);
   }
   // Testing with array of doubles
   {
     std::array<double, test_size> a;
     a.fill(value);
-    overwrite(a, [&](double& n) { n *= n; }, value);
+    overwrite(
+        a, [&](double& n) { n *= n; }, value);
   }
   // Testing with array of integers
   {
     std::array<int, test_size> a;
     a.fill((int)value);
-    overwrite(a, [&](int& n) { n *= n; }, (int)value);
+    overwrite(
+        a, [&](int& n) { n *= n; }, (int)value);
   }
   return 0;
 }

>From e48ec851e2e3e7bcbba1f09caefb4cde1be90e7b Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Thu, 2 Nov 2023 10:14:07 -0700
Subject: [PATCH 36/46] clang-formatted omp_offload.h and fill_offload.pass.cpp

---
 .../__algorithm/pstl_backends/openmp/omp_offload.h   |  8 ++++----
 .../alg.pstl.offload/fill_offload.pass.cpp           |  6 +++---
 libcxx/utils/libcxx/test/params.py                   | 12 ++++++------
 3 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h b/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
index d47fdcd39c39b25..b2e0d46d1fcd254 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
@@ -34,28 +34,28 @@ template <class _Iterator, class _DifferenceType>
 _LIBCPP_HIDE_FROM_ABI void
 __omp_map_to([[maybe_unused]] const _Iterator __p, [[maybe_unused]] const _DifferenceType __len) noexcept {
   static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
-#  pragma omp target enter data map(to : __p[0 : __len])
+#  pragma omp target enter data map(to : __p [0:__len])
 }
 
 template <class _Iterator, class _DifferenceType>
 _LIBCPP_HIDE_FROM_ABI void
 __omp_map_from([[maybe_unused]] const _Iterator __p, [[maybe_unused]] const _DifferenceType __len) noexcept {
   static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
-#  pragma omp target exit data map(from : __p[0 : __len])
+#  pragma omp target exit data map(from : __p [0:__len])
 }
 
 template <class _Iterator, class _DifferenceType>
 _LIBCPP_HIDE_FROM_ABI void
 __omp_map_alloc([[maybe_unused]] const _Iterator __p, [[maybe_unused]] const _DifferenceType __len) noexcept {
   static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
-#  pragma omp target enter data map(alloc : __p[0 : __len])
+#  pragma omp target enter data map(alloc : __p [0:__len])
 }
 
 template <class _Iterator, class _DifferenceType>
 _LIBCPP_HIDE_FROM_ABI void
 __omp_map_release([[maybe_unused]] const _Iterator __p, [[maybe_unused]] const _DifferenceType __len) noexcept {
   static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
-#  pragma omp target exit data map(release : __p[0 : __len])
+#  pragma omp target exit data map(release : __p [0:__len])
 }
 
 } // namespace __omp_backend
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp
index a97cb42845eb6f3..c6e14b13e9b1864 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp
@@ -35,7 +35,7 @@ int main(void) {
   // the device, because the map inside std::fill will then only increment and
   // decrement reference counters and not move data.
   int* data = v.data();
-#pragma omp target enter data map(to : data[0 : v.size()])
+#pragma omp target enter data map(to : data [0:v.size()])
   std::fill(std::execution::par_unseq, v.begin(), v.end(), -2);
 
   // At this point v should only contain the value 2
@@ -44,11 +44,11 @@ int main(void) {
            "std::fill transferred data from device to the host but should only have decreased the reference counter.");
 
 // After moving the result back to the host it should now be -2
-#pragma omp target update from(data[0 : v.size()])
+#pragma omp target update from(data [0:v.size()])
   for (int vi : v)
     assert(vi == -2 && "std::fill did not update the result on the device.");
 
-#pragma omp target exit data map(delete : data[0 : v.size()])
+#pragma omp target exit data map(delete : data [0:v.size()])
 
   return 0;
 }
diff --git a/libcxx/utils/libcxx/test/params.py b/libcxx/utils/libcxx/test/params.py
index 5f99533d5b4bebc..ffc731b93fc0698 100644
--- a/libcxx/utils/libcxx/test/params.py
+++ b/libcxx/utils/libcxx/test/params.py
@@ -341,14 +341,14 @@ def getStdFlag(cfg, std):
             AddCompileFlag("-fopenmp"),
             # The linker needs to find the correct version of libomptarget
             AddLinkFlag("-Wl,-rpath,%{lib}/../../lib"),
-            #AddLinkFlag("-L%{lib}/../../lib -lomptarget"),
-            #  The preprocessor needs to find the omp.h header
-            # If OpenMP was installed as a project, the header lives in the
-            # following directory
+            # The preprocessor needs to find the omp.h header. If OpenMP was 
+            # installed as a project, the header lives in the following
+            # directory
             AddFlag("-I %{lib}/../../projects/openmp/runtime/src/"),
-            # And if it was installed as a runtime it lives in
+            # And if it was installed as a runtime it lives in the following:
             AddFlag("-I %{lib}/../../runtimes/runtimes-bins/openmp/runtime/src"),
-            # If the OpenMP PSTL backend was enbaled, we wish to run the tests for it
+            # If the OpenMP PSTL backend was enbaled, we wish to run the tests
+            # for it
             AddFeature("openmp_pstl_backend")
         ] if enabled else [],
     )

>From e91918b4f007ec318b6abdab0d4bfae64e519bc7 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Thu, 2 Nov 2023 10:19:39 -0700
Subject: [PATCH 37/46] Changed single quotes to double quotes in run.py

---
 libcxx/utils/run.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/libcxx/utils/run.py b/libcxx/utils/run.py
index 5baffadb35ec06b..e172449cf3d18b6 100755
--- a/libcxx/utils/run.py
+++ b/libcxx/utils/run.py
@@ -66,18 +66,18 @@ def main():
     
     # Forwarding the environment variable CUDA_VISIBLE_DEVICES which configures
     # the visible NVIDIA GPUs.
-    if 'CUDA_VISIBLE_DEVICES' in os.environ:
-        env['CUDA_VISIBLE_DEVICES'] = os.environ['CUDA_VISIBLE_DEVICES']
+    if "CUDA_VISIBLE_DEVICES" in os.environ:
+        env["CUDA_VISIBLE_DEVICES"] = os.environ["CUDA_VISIBLE_DEVICES"]
 
     # Forwarding the environment variable ROCR_VISIBLE_DEVICES which configures
     # the visible AMD GPUs.
-    if 'ROCR_VISIBLE_DEVICES' in os.environ:
-        env['ROCR_VISIBLE_DEVICES'] = os.environ['ROCR_VISIBLE_DEVICES']
+    if "ROCR_VISIBLE_DEVICES" in os.environ:
+        env["ROCR_VISIBLE_DEVICES"] = os.environ["ROCR_VISIBLE_DEVICES"]
 
     # Pass the OpenMP debug flag. Can be used to print information about the
     # GPU execution of the tests.
-    if 'LIBOMPTARGET_DEBUG' in os.environ:
-        env['LIBOMPTARGET_DEBUG'] = os.environ['LIBOMPTARGET_DEBUG']
+    if "LIBOMPTARGET_DEBUG" in os.environ:
+        env["LIBOMPTARGET_DEBUG"] = os.environ["LIBOMPTARGET_DEBUG"]
 
     # Run the command line with the given environment in the execution directory.
     return subprocess.call(commandLine, cwd=args.execdir, env=env, shell=False)

>From 5225daf82b91527be7470cc1d5f88eb3fee9e3f0 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Thu, 2 Nov 2023 10:43:59 -0700
Subject: [PATCH 38/46] Did I finally clang-format omp_offload.h,
 fill_offload.pass.cpp, and for_each_overwrite_input.pass.cpp right?

---
 .../__algorithm/pstl_backends/openmp/omp_offload.h   |  8 ++++----
 .../alg.pstl.offload/fill_offload.pass.cpp           |  6 +++---
 .../for_each_overwrite_input.pass.cpp                | 12 ++++--------
 3 files changed, 11 insertions(+), 15 deletions(-)

diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h b/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
index b2e0d46d1fcd254..d47fdcd39c39b25 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
@@ -34,28 +34,28 @@ template <class _Iterator, class _DifferenceType>
 _LIBCPP_HIDE_FROM_ABI void
 __omp_map_to([[maybe_unused]] const _Iterator __p, [[maybe_unused]] const _DifferenceType __len) noexcept {
   static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
-#  pragma omp target enter data map(to : __p [0:__len])
+#  pragma omp target enter data map(to : __p[0 : __len])
 }
 
 template <class _Iterator, class _DifferenceType>
 _LIBCPP_HIDE_FROM_ABI void
 __omp_map_from([[maybe_unused]] const _Iterator __p, [[maybe_unused]] const _DifferenceType __len) noexcept {
   static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
-#  pragma omp target exit data map(from : __p [0:__len])
+#  pragma omp target exit data map(from : __p[0 : __len])
 }
 
 template <class _Iterator, class _DifferenceType>
 _LIBCPP_HIDE_FROM_ABI void
 __omp_map_alloc([[maybe_unused]] const _Iterator __p, [[maybe_unused]] const _DifferenceType __len) noexcept {
   static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
-#  pragma omp target enter data map(alloc : __p [0:__len])
+#  pragma omp target enter data map(alloc : __p[0 : __len])
 }
 
 template <class _Iterator, class _DifferenceType>
 _LIBCPP_HIDE_FROM_ABI void
 __omp_map_release([[maybe_unused]] const _Iterator __p, [[maybe_unused]] const _DifferenceType __len) noexcept {
   static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
-#  pragma omp target exit data map(release : __p [0:__len])
+#  pragma omp target exit data map(release : __p[0 : __len])
 }
 
 } // namespace __omp_backend
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp
index c6e14b13e9b1864..a97cb42845eb6f3 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp
@@ -35,7 +35,7 @@ int main(void) {
   // the device, because the map inside std::fill will then only increment and
   // decrement reference counters and not move data.
   int* data = v.data();
-#pragma omp target enter data map(to : data [0:v.size()])
+#pragma omp target enter data map(to : data[0 : v.size()])
   std::fill(std::execution::par_unseq, v.begin(), v.end(), -2);
 
   // At this point v should only contain the value 2
@@ -44,11 +44,11 @@ int main(void) {
            "std::fill transferred data from device to the host but should only have decreased the reference counter.");
 
 // After moving the result back to the host it should now be -2
-#pragma omp target update from(data [0:v.size()])
+#pragma omp target update from(data[0 : v.size()])
   for (int vi : v)
     assert(vi == -2 && "std::fill did not update the result on the device.");
 
-#pragma omp target exit data map(delete : data [0:v.size()])
+#pragma omp target exit data map(delete : data[0 : v.size()])
 
   return 0;
 }
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp
index 5c984fca88a4681..8729b29ac072139 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp
@@ -42,28 +42,24 @@ int main(void) {
   // Testing with vector of doubles
   {
     std::vector<double> v(test_size, value);
-    overwrite(
-        v, [&](double& n) { n *= n; }, value);
+    overwrite(v, [&](double& n) { n *= n; }, value);
   }
   // Testing with vector of integers
   {
     std::vector<int> v(test_size, (int)value);
-    overwrite(
-        v, [&](int& n) { n *= n; }, (int)value);
+    overwrite(v, [&](int& n) { n *= n; }, (int)value);
   }
   // Testing with array of doubles
   {
     std::array<double, test_size> a;
     a.fill(value);
-    overwrite(
-        a, [&](double& n) { n *= n; }, value);
+    overwrite(a, [&](double& n) { n *= n; }, value);
   }
   // Testing with array of integers
   {
     std::array<int, test_size> a;
     a.fill((int)value);
-    overwrite(
-        a, [&](int& n) { n *= n; }, (int)value);
+    overwrite(a, [&](int& n) { n *= n; }, (int)value);
   }
   return 0;
 }

>From dc60a4dc279e6f86f13eeb0a5169353a79ddf8fe Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Thu, 2 Nov 2023 12:05:10 -0700
Subject: [PATCH 39/46] run.py whitespace

---
 libcxx/utils/run.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libcxx/utils/run.py b/libcxx/utils/run.py
index e172449cf3d18b6..427e96b91fa3705 100755
--- a/libcxx/utils/run.py
+++ b/libcxx/utils/run.py
@@ -63,7 +63,7 @@ def main():
         # TEMP is needed for placing temp files in a sensible directory.
         if "TEMP" in os.environ:
             env["TEMP"] = os.environ.get("TEMP")
-    
+
     # Forwarding the environment variable CUDA_VISIBLE_DEVICES which configures
     # the visible NVIDIA GPUs.
     if "CUDA_VISIBLE_DEVICES" in os.environ:

>From 3137fc6e8839d80530c9fa76c3edac3f99f2627b Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Thu, 2 Nov 2023 13:32:20 -0700
Subject: [PATCH 40/46] Fixed code formatting in .rst documentation

---
 libcxx/CMakeLists.txt                                  |  7 +++++--
 libcxx/docs/UsingLibcxx.rst                            | 10 +++++-----
 .../__algorithm/pstl_backends/openmp/omp_offload.h     |  2 +-
 3 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt
index 71ba749fd120dd5..07548c367ccb67a 100644
--- a/libcxx/CMakeLists.txt
+++ b/libcxx/CMakeLists.txt
@@ -559,6 +559,11 @@ function(cxx_add_basic_build_flags target)
     endif()
   endif()
   target_compile_options(${target} PUBLIC "${LIBCXX_ADDITIONAL_COMPILE_FLAGS}")
+
+  # If the PSTL backend depends on OpenMP, we must enable the OpenMP tool chain
+  if (LIBCXX_PSTL_BACKEND STREQUAL "openmp")
+    target_add_compile_flags_if_supported(${target} PUBLIC -fopenmp)
+  endif()
 endfunction()
 
 # Exception flags =============================================================
@@ -782,8 +787,6 @@ elseif (LIBCXX_PSTL_BACKEND STREQUAL "libdispatch")
   config_define(1 _LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH)
 elseif (LIBCXX_PSTL_BACKEND STREQUAL "openmp")
   config_define(1 _LIBCPP_PSTL_BACKEND_OPENMP)
-  # Making sure that OpenMP is enabled during build
-  add_compile_options(-fopenmp)
 else()
   message(FATAL_ERROR "LIBCXX_PSTL_BACKEND is set to ${LIBCXX_PSTL_BACKEND}, which is not a valid backend.
                        Valid backends are: serial, std-thread, libdispatch, and openmp.")
diff --git a/libcxx/docs/UsingLibcxx.rst b/libcxx/docs/UsingLibcxx.rst
index ce8fb04a8ec1562..da267b1fe56ffe9 100644
--- a/libcxx/docs/UsingLibcxx.rst
+++ b/libcxx/docs/UsingLibcxx.rst
@@ -477,8 +477,8 @@ iterators for ``std::vector`` or ``std::array``.
 To enable the OpenMP offloading backend it must be selected with
 ``LIBCXX_PSTL_BACKEND=openmp`` when installing ``libc++``. Further, when
 compiling a program, the user must specify the command line options
-``-fopenmp -fexperimental-library -stdlib=libc++``. To install LLVM with OpenMP
-offloading enabled, please read
+``-fopenmp -fexperimental-library``. To install LLVM with OpenMP offloading
+enabled, please read
 `the LLVM OpenMP FAQ. <https://openmp.llvm.org/SupportAndFAQ.html>`_ 
 You may also want to to visit
 `the OpenMP offloading command-line argument reference. <https://openmp.llvm.org/CommandLineArgumentReference.html#offload-command-line-arguments>`_ 
@@ -531,9 +531,9 @@ space. However, discrete GPU systems have distinct address spaces. A single
 address space can be emulated if your system supports unified shared memory.
 However, many discrete GPU systems do not, and in those cases it is important to
 pass device function pointers to the parallel algorithms. Below is an example of
-how the OpenMP `declare target` directive can be used to mark that a function
+how the OpenMP ``declare target`` directive can be used to mark that a function
 should be compiled for both host and device. The device address of a function
-pointer can be obtained with `target map(from:<list of identifiers>)`.
+pointer can be obtained with ``target map(from:<list of identifiers>)``.
 
 .. code-block:: cpp
 
@@ -559,7 +559,7 @@ pointer can be obtained with `target map(from:<list of identifiers>)`.
   }
 
 Without unified shared memory, the above example will not work if the host
-function pointer `cube` is passed to the parallel algorithm.
+function pointer ``cube`` is passed to the parallel algorithm.
 
 Important notes about exception handling
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h b/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
index d47fdcd39c39b25..d2bcfbb98754b34 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp/omp_offload.h
@@ -27,7 +27,7 @@ inline namespace __omp_backend {
 //===----------------------------------------------------------------------===//
 // The following four functions can be used to map contiguous array sections to
 // and from the device. For now, they are simple overlays of the OpenMP pragmas,
-// but they should be updated wen adding support for other iterator types.
+// but they should be updated when adding support for other iterator types.
 //===----------------------------------------------------------------------===//
 
 template <class _Iterator, class _DifferenceType>

>From 2fac869415593d2f3ebbc0af2b8a5c9ebe40432e Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Thu, 2 Nov 2023 15:19:20 -0700
Subject: [PATCH 41/46] Improved formatting of examples in documentation

---
 libcxx/docs/UsingLibcxx.rst | 56 ++++++++++++++++++++-----------------
 1 file changed, 30 insertions(+), 26 deletions(-)

diff --git a/libcxx/docs/UsingLibcxx.rst b/libcxx/docs/UsingLibcxx.rst
index da267b1fe56ffe9..ecad68eac9a434c 100644
--- a/libcxx/docs/UsingLibcxx.rst
+++ b/libcxx/docs/UsingLibcxx.rst
@@ -487,18 +487,19 @@ Example
 ~~~~~~~
 
 The following is an example of offloading vector addition to a GPU using our
-standard library extension.
+standard library extension. It implements the classical vector addition from
+BLAS that overwrites the vector ``y`` with ``y=a*x+y``. Thus ``y.begin()`` is
+both used as an input and an output iterator in this example.
 
 .. code-block:: cpp
 
   #include <algorithm>
   #include <execution>
 
-  template<typename T1, typename T2, typename T3>
-  void axpy(const T1 a,std::vector<T2>& x, std::vector<T3>& y)
-  {
-    std::transform(std::execution::par_unseq,x.begin(),x.end(), y.begin(), y.begin(),
-                  [=](T2 xi, T3 yi){ return a*xi + yi; });
+  template <typename T1, typename T2, typename T3>
+  void axpy(const T1 a, const std::vector<T2> &x, std::vector<T3> &y) {
+    std::transform(std::execution::par_unseq, x.begin(), x.end(), y.begin(),
+                  y.begin(), [=](T2 xi, T3 yi) { return a * xi + yi; });
   }
 
 The execution policy ``std::execution::par_unseq`` states that the algorithm's
@@ -512,12 +513,11 @@ be implemented in the following way.
 
 .. code-block:: cpp
 
-  template<typename T1, typename T2, typename T3>
-  void axpy(const T1 a,std::vector<T2>& x, std::vector<T3>& y)
-  {
-  # pragma omp target data map(to:a)
-    std::transform(std::execution::par_unseq,x.begin(),x.end(), y.begin(), y.begin(),
-                  [&](T2 xi, T3 yi){ return a*xi + yi; });
+  template <typename T1, typename T2, typename T3>
+  void axpy(const T1 a, const std::vector<T2> &x, std::vector<T3> &y) {
+  #pragma omp target data map(to : a)
+    std::transform(std::execution::par_unseq, x.begin(), x.end(), y.begin(),
+                  y.begin(), [&](T2 xi, T3 yi) { return a * xi + yi; });
   }
 
 However, if unified shared memory, USM, is enabled, no additional data mapping
@@ -539,27 +539,31 @@ pointer can be obtained with ``target map(from:<list of identifiers>)``.
 
   // Declare that the function must be compiled for both host and device
   #pragma omp declare target
-  void cube(int& n) {n*=n*n; };
+  // This function computes the squared difference of two floating points
+  float squared(float a, float b) { return a * a - 2.0f * a * b + b * b; };
   #pragma omp end declare target
 
-  int main()
-  {
-    std::vector<int> a(LEN,2);
-    // Get the device pointer for cube
-    void (*dcube)(int& n);
-    #pragma omp target map(from:dcube)
-    dcube = &cube;
+  int main() {
+    std::vector<float> a(100, 1.0);
+    std::vector<float> b(100, 1.25);
+
+    // Get the device pointer for squared
+    float (*dev_squared)(float, float);
+  #pragma omp target map(from : dev_squared)
+    dev_squared = &squared;
+
     // Pass the device function pointer to the parallel algorithm
-    std::for_each(std::execution::par_unseq,a.begin(), a.end(),dcube);
-    // Validate that the result is 8 on the host for all array indices
-    std::for_each(std::execution::par,a.begin(), a.end(),[&](int & n){
-      assert(n == 8);
-    });
+    float sum =
+        std::transform_reduce(std::execution::par_unseq, a.begin(), a.end(),
+                              b.begin(), 0.0f, std::plus{}, dev_squared);
+
+    // Validate that the result is approximately 6.25
+    assert(std::abs(sum - 6.25f) < 1e-10);
     return 0;
   }
 
 Without unified shared memory, the above example will not work if the host
-function pointer ``cube`` is passed to the parallel algorithm.
+function pointer ``squared`` is passed to the parallel algorithm.
 
 Important notes about exception handling
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

>From 7e9d846fb8b626429840d7cf76cd143f6d8ccd9d Mon Sep 17 00:00:00 2001
From: Anton Rydahl <44206479+AntonRydahl at users.noreply.github.com>
Date: Thu, 2 Nov 2023 15:54:18 -0700
Subject: [PATCH 42/46] Update libcxx/docs/UsingLibcxx.rst

Co-authored-by: Louis Dionne <ldionne.2 at gmail.com>
---
 libcxx/docs/UsingLibcxx.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libcxx/docs/UsingLibcxx.rst b/libcxx/docs/UsingLibcxx.rst
index ecad68eac9a434c..94e92982bc65194 100644
--- a/libcxx/docs/UsingLibcxx.rst
+++ b/libcxx/docs/UsingLibcxx.rst
@@ -569,7 +569,7 @@ Important notes about exception handling
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 GPU architectures do not support exception handling. If compiling a program
-containing parallel algorithms with ``clang`` 18 or newer, a program with
+containing parallel algorithms with current versions of Clang, a program with
 exceptions in offloaded code regions will compile, but the program will
 terminate if an exception is thrown on the device. This does not conform with
 the C++ standard and exception handling on GPUs will hopefully be better

>From e512875c079e65628163017885482e7d417a86c5 Mon Sep 17 00:00:00 2001
From: Anton Rydahl <44206479+AntonRydahl at users.noreply.github.com>
Date: Thu, 2 Nov 2023 15:55:10 -0700
Subject: [PATCH 43/46] Update
 libcxx/include/__algorithm/pstl_backends/openmp.h

Co-authored-by: Louis Dionne <ldionne.2 at gmail.com>
---
 libcxx/include/__algorithm/pstl_backends/openmp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libcxx/include/__algorithm/pstl_backends/openmp.h b/libcxx/include/__algorithm/pstl_backends/openmp.h
index a3b52456dc6a609..e70ba1a0217912d 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp.h
@@ -60,7 +60,7 @@ Exceptions
 Currently, GPU architectures do not handle exceptions. OpenMP target regions are
 allowed to contain try/catch statements and throw expressions in Clang, but if a
 throw expression is reached, it will terminate the program. That does not
-conform with the C++ standard.
+conform to the C++ standard.
 
 [This document](https://eel.is/c++draft/algorithms.parallel) has been used as
 reference for these considerations.

>From 02c1f3c791f9720bb7d0ff16a89ef856b8839b0a Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Thu, 2 Nov 2023 16:02:28 -0700
Subject: [PATCH 44/46] Change function declaration for main functions from int
 main(void) to int main(int,char**)

---
 .../libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp    | 2 +-
 libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if.pass.cpp | 2 +-
 .../libcxx/algorithms/alg.pstl.offload/find_if_offload.pass.cpp | 2 +-
 .../algorithms/alg.pstl.offload/for_each_offload.pass.cpp       | 2 +-
 .../alg.pstl.offload/for_each_overwrite_input.pass.cpp          | 2 +-
 .../alg.pstl.offload/gpu_environemt_variables.pass.cpp          | 2 +-
 .../algorithms/alg.pstl.offload/transform_offload.pass.cpp      | 2 +-
 .../alg.pstl.offload/transform_reduce_offload.pass.cpp          | 2 +-
 .../transform_reduce_supported_binary_operations.pass.cpp       | 2 +-
 9 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp
index a97cb42845eb6f3..c74384519cec34f 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/fill_offload.pass.cpp
@@ -22,7 +22,7 @@
 #include <vector>
 #include <omp.h>
 
-int main(void) {
+int main(int, char**) {
   // We only run the test if a device is detected by OpenMP
   if (omp_get_num_devices() < 1)
     return 0;
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if.pass.cpp
index 38544d76b86ad71..7d379c032d6122a 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if.pass.cpp
@@ -41,7 +41,7 @@ void check_find_if(_Tp& data) {
   }
 }
 
-int main(void) {
+int main(int, char**) {
   const int test_size = 10000;
   // Testing with vector of doubles
   {
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if_offload.pass.cpp
index 82425be01c181fa..72cc4568aa26eb2 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/find_if_offload.pass.cpp
@@ -22,7 +22,7 @@
 #include <vector>
 #include <omp.h>
 
-int main(void) {
+int main(int, char**) {
   // We only run the test if a device is detected by OpenMP
   if (omp_get_num_devices() < 1)
     return 0;
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_offload.pass.cpp
index e2a2797e5515dc7..18417c3c9f0324d 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_offload.pass.cpp
@@ -22,7 +22,7 @@
 #include <vector>
 #include <omp.h>
 
-int main(void) {
+int main(int, char**) {
   // We only run the test if a device is detected by OpenMP
   if (omp_get_num_devices() < 1)
     return 0;
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp
index 8729b29ac072139..5151984e7969c9b 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/for_each_overwrite_input.pass.cpp
@@ -36,7 +36,7 @@ void overwrite(_Tp& data, _Predicate pred, const _Up& value) {
         "The GPU implementation of std::for_each does not allow users to mutate the input as the C++ standard does.");
 }
 
-int main(void) {
+int main(int, char**) {
   const double value  = 2.0;
   const int test_size = 10000;
   // Testing with vector of doubles
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/gpu_environemt_variables.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/gpu_environemt_variables.pass.cpp
index eb9265556cb0daf..0977f7a1f810482 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/gpu_environemt_variables.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/gpu_environemt_variables.pass.cpp
@@ -30,7 +30,7 @@ std::string get_env_var(std::string const& env_var_name, int& flag) {
   return (val != NULL) ? val : "";
 }
 
-int main(void) {
+int main(int, char**) {
   // Stores whether the environment variable was found
   int status = 0;
 
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_offload.pass.cpp
index 1be15fd15454ffb..74255ce0a99e2f1 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_offload.pass.cpp
@@ -22,7 +22,7 @@
 #include <vector>
 #include <omp.h>
 
-int main(void) {
+int main(int, char**) {
   // We only run the test if a device is detected by OpenMP
   if (omp_get_num_devices() < 1)
     return 0;
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_offload.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_offload.pass.cpp
index 30d45060c4ec28b..d141ff307bb701c 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_offload.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_offload.pass.cpp
@@ -23,7 +23,7 @@
 #include <vector>
 #include <omp.h>
 
-int main(void) {
+int main(int, char**) {
   // We only run the test if a device is detected by OpenMP
   if (omp_get_num_devices() < 1)
     return 0;
diff --git a/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_supported_binary_operations.pass.cpp b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_supported_binary_operations.pass.cpp
index 593a63e57c806b3..5cc57888c4549f4 100644
--- a/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_supported_binary_operations.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.pstl.offload/transform_reduce_supported_binary_operations.pass.cpp
@@ -33,7 +33,7 @@
 #include <omp.h>
 #include <iostream>
 
-int main(void) {
+int main(int, char**) {
   // We only run the test if a device is detected by OpenMP
   if (omp_get_num_devices() < 1)
     return 0;

>From 9690aea32027252833b3cefa0742741888b54f1c Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Thu, 2 Nov 2023 16:34:30 -0700
Subject: [PATCH 45/46] Improved documentation in
 include/__algorithm/pstl_backends/openmp.h

---
 .../__algorithm/pstl_backends/openmp.h        | 128 ++++++++++--------
 1 file changed, 71 insertions(+), 57 deletions(-)

diff --git a/libcxx/include/__algorithm/pstl_backends/openmp.h b/libcxx/include/__algorithm/pstl_backends/openmp.h
index e70ba1a0217912d..fdd23e051e5596e 100644
--- a/libcxx/include/__algorithm/pstl_backends/openmp.h
+++ b/libcxx/include/__algorithm/pstl_backends/openmp.h
@@ -11,65 +11,79 @@
 
 #include <__config>
 
-/*
-Combined OpenMP CPU and GPU Backend
-===================================
-Contrary to the CPU backends found in ./cpu_backends/, the OpenMP backend can
-target both CPUs and GPUs. The OpenMP standard defines that when offloading code
-to an accelerator, the compiler must generate a fallback code for execution on
-the host. Thereby, the backend works as a CPU backend if no targeted accelerator
-is available at execution time. The target regions can also be compiled directly
-for a CPU architecture, for instance by adding the command-line option
-`-fopenmp-targets=x86_64-pc-linux-gnu` in Clang.
-
-When is an Algorithm Offloaded?
--------------------------------
-Only parallel algorithms with the parallel unsequenced execution policy are
-offloaded to the device. We cannot offload parallel algorithms with a parallel
-execution policy to GPUs because invocations executing in the same thread "are
-indeterminately sequenced with respect to each other" which we cannot guarantee
-on a GPU.
-
-The standard draft states that "the semantics [...] allow the implementation to
-fall back to sequential execution if the system cannot parallelize an algorithm
-invocation". If it is not deemed safe to offload the parallel algorithm to the
-device, we first fall back to a parallel unsequenced implementation from
-./cpu_backends. The CPU implementation may then fall back to sequential
-execution. In that way we strive to achieve the best possible performance.
-
-Further, "it is the caller's responsibility to ensure that the invocation does
-not introduce data races or deadlocks."
-
-Implicit Assumptions
---------------------
-If the user provides a function pointer as an argument to a parallel algorithm,
-it is assumed that it is the device pointer as there is currently no way to
-check whether a host or device pointer was passed.
-
-Mapping Clauses
----------------
-In some of the parallel algorithms, the user is allowed to provide the same
-iterator as input and output. Hence, the order of the maps matters. Therefore,
-`pragma omp target data map(to:...)` must be used before
-`pragma omp target data map(alloc:...)`. Conversely, the maps with map modifier
-`release` must be placed before the maps with map modifier `from` when
-transferring the result from the device to the host.
-
-Exceptions
-----------
-Currently, GPU architectures do not handle exceptions. OpenMP target regions are
-allowed to contain try/catch statements and throw expressions in Clang, but if a
-throw expression is reached, it will terminate the program. That does not
-conform to the C++ standard.
-
-[This document](https://eel.is/c++draft/algorithms.parallel) has been used as
-reference for these considerations.
-
-*/
-
-#include <__algorithm/pstl_backends/openmp/backend.h>
+// Combined OpenMP CPU and GPU Backend
+// ===================================
+// Contrary to the CPU backends found in ./cpu_backends/, the OpenMP backend can
+// target both CPUs and GPUs. The OpenMP standard defines that when offloading
+// code to an accelerator, the compiler must generate a fallback code for
+// execution on the host. Thereby, the backend works as a CPU backend if no
+// targeted accelerator is available at execution time. The target regions can
+// also be compiled directly for a CPU architecture, for instance by adding the
+// command-line option `-fopenmp-targets=x86_64-pc-linux-gnu` in Clang.
+//
+// When is an Algorithm Offloaded?
+// -------------------------------
+// Only parallel algorithms with the parallel unsequenced execution policy are
+// offloaded to the device. We cannot offload parallel algorithms with a
+// parallel execution policy to GPUs because invocations executing in the same
+// thread "are indeterminately sequenced with respect to each other" which we
+// cannot guarantee on a GPU.
+//
+// The standard draft states that "the semantics [...] allow the implementation
+// to fall back to sequential execution if the system cannot parallelize an
+// algorithm invocation". If it is not deemed safe to offload the parallel
+// algorithm to the device, we first fall back to a parallel unsequenced
+// implementation from ./cpu_backends. The CPU implementation may then fall back
+// to sequential execution. In that way we strive to achieve the best possible
+// performance.
+//
+// Further, "it is the caller's responsibility to ensure that the invocation
+// does not introduce data races or deadlocks."
+//
+// Implicit Assumptions
+// --------------------
+// If the user provides a function pointer as an argument to a parallel
+// algorithm, it is assumed that it is the device pointer as there is currently
+// no way to check whether a host or device pointer was passed.
+//
+// Mapping Clauses
+// ---------------
+// In some of the parallel algorithms, the user is allowed to provide the same
+// iterator as input and output. The order of the maps matters because OpenMP
+// keeps a reference counter of which variables have been mapped to the device.
+// Thereby, a varible is only copied to the device if its reference counter is
+// incremented from zero, and it is only copied back to the host when the
+// reference counter is decremented to zero again.
+// This allows nesting mapped regions, for instance in recursive functions,
+// without enforcing a lot of unnecessary data movement.
+// Therefore, `pragma omp target data map(to:...)` must be used before
+// `pragma omp target data map(alloc:...)`. Conversely, the maps with map
+// modifier `release` must be placed before the maps with map modifier `from`
+// when transferring the result from the device to the host.
+//
+// Example: Assume `a` and `b` are pointers to the same array.
+// ``` C++
+// #pragma omp target enter data map(alloc:a[0:n])
+// // The reference counter is incremented from 0 to 1. a is not copied to the
+// // device because of the `alloc` map modifier.
+// #pragma omp target enter data map(to:b[0:n])
+// // The reference counter is incremented from 1 to 2. b is not copied because
+// // the reference counter is positive. Therefore b, and a, are uninitialized
+// // on the device.
+// ```
+//
+// Exceptions
+// ----------
+// Currently, GPU architectures do not handle exceptions. OpenMP target regions
+// are allowed to contain try/catch statements and throw expressions in Clang,
+// but if a throw expression is reached, it will terminate the program. That
+// does not conform to the C++ standard.
+//
+// [This document](https://eel.is/c++draft/algorithms.parallel) has been used as
+// reference for these considerations.
 
 #include <__algorithm/pstl_backends/openmp/any_of.h>
+#include <__algorithm/pstl_backends/openmp/backend.h>
 #include <__algorithm/pstl_backends/openmp/fill.h>
 #include <__algorithm/pstl_backends/openmp/find_if.h>
 #include <__algorithm/pstl_backends/openmp/for_each.h>

>From bf31c50a9ba23e55fcd8d72a9e5b7d3b7a641cf3 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Thu, 2 Nov 2023 16:42:55 -0700
Subject: [PATCH 46/46] Removing accidental change to
 libcxx/test/configs/llvm-libc++-shared.cfg.in

---
 libcxx/test/configs/llvm-libc++-shared.cfg.in | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libcxx/test/configs/llvm-libc++-shared.cfg.in b/libcxx/test/configs/llvm-libc++-shared.cfg.in
index 8ffe69f6271510f..143b3b3feae1109 100644
--- a/libcxx/test/configs/llvm-libc++-shared.cfg.in
+++ b/libcxx/test/configs/llvm-libc++-shared.cfg.in
@@ -24,4 +24,4 @@ libcxx.test.config.configure(
     libcxx.test.features.DEFAULT_FEATURES,
     config,
     lit_config
-)
\ No newline at end of file
+)