[flang-commits] [flang] Adding Separate OpenMP Offloading Backend to `libcxx/include/__algorithm/pstl_backends` (PR #66968)

Mon Sep 25 10:21:29 PDT 2023

https://github.com/AntonRydahl updated https://github.com/llvm/llvm-project/pull/66968

>From b35340e47de896c9933c54ce617538c46cf01488 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Wed, 20 Sep 2023 17:06:10 -0700
Subject: [PATCH 1/5] Adding OpenMP Offloading Backend for C++ Parallel
 Algorithms

---
 libcxx/CMakeLists.txt                         | 14 +++
 libcxx/include/CMakeLists.txt                 |  5 +
 libcxx/include/__algorithm/pstl_backend.h     |  8 ++
 .../__algorithm/pstl_backends/gpu_backend.h   | 21 +++++
 .../pstl_backends/gpu_backends/backend.h      | 33 +++++++
 .../pstl_backends/gpu_backends/fill.h         | 59 ++++++++++++
 .../pstl_backends/gpu_backends/for_each.h     | 59 ++++++++++++
 .../pstl_backends/gpu_backends/omp_offload.h  | 91 +++++++++++++++++++
 libcxx/include/__config_site.in               |  1 +
 9 files changed, 291 insertions(+)
 create mode 100644 libcxx/include/__algorithm/pstl_backends/gpu_backend.h
 create mode 100644 libcxx/include/__algorithm/pstl_backends/gpu_backends/backend.h
 create mode 100644 libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
 create mode 100644 libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
 create mode 100644 libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h

diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt
index bb2898b799bcef9..43d2a448de79584 100644
--- a/libcxx/CMakeLists.txt
+++ b/libcxx/CMakeLists.txt
@@ -290,6 +290,8 @@ option(LIBCXX_HAS_WIN32_THREAD_API "Ignore auto-detection and force use of win32
 option(LIBCXX_HAS_EXTERNAL_THREAD_API
   "Build libc++ with an externalized threading API.
    This option may only be set to ON when LIBCXX_ENABLE_THREADS=ON." OFF)
+option(LIBCXX_ENABLE_GPU_OFFLOAD 
+  "Build libc++ with support for GPU offload" OFF)
 
 if (LIBCXX_ENABLE_THREADS)
   set(LIBCXX_PSTL_CPU_BACKEND "std_thread" CACHE STRING "Which PSTL CPU backend to use")
@@ -297,6 +299,14 @@ else()
   set(LIBCXX_PSTL_CPU_BACKEND "serial" CACHE STRING "Which PSTL CPU backend to use")
 endif()
 
+if (NOT DEFINED LIBCXX_PSTL_GPU_BACKEND)
+  if (${LIBCXX_ENABLE_GPU_OFFLOAD})
+    set(LIBCXX_PSTL_GPU_BACKEND "omp_offload" CACHE STRING "Which PSTL GPU backend to use")
+  else()
+    set(LIBCXX_PSTL_GPU_BACKEND "none" CACHE STRING "Which PSTL GPU backend to use")
+  endif()
+endif()
+
 # Misc options ----------------------------------------------------------------
 # FIXME: Turn -pedantic back ON. It is currently off because it warns
 # about #include_next which is used everywhere.
@@ -809,6 +819,10 @@ else()
                        Valid backends are: serial, std_thread and libdispatch")
 endif()
 
+if (LIBCXX_PSTL_GPU_BACKEND STREQUAL "omp_offload")
+  config_define(1 _LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
+endif()
+
 if (LIBCXX_ABI_DEFINES)
   set(abi_defines)
   foreach (abi_define ${LIBCXX_ABI_DEFINES})
diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index 2ec755236dbaee2..a3d72df61a86dde 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -85,6 +85,11 @@ set(files
   __algorithm/pstl_backends/cpu_backends/thread.h
   __algorithm/pstl_backends/cpu_backends/transform.h
   __algorithm/pstl_backends/cpu_backends/transform_reduce.h
+  __algorithm/pstl_backends/gpu_backend.h
+  __algorithm/pstl_backends/gpu_backends/backend.h
+  __algorithm/pstl_backends/gpu_backends/fill.h
+  __algorithm/pstl_backends/gpu_backends/for_each.h
+  __algorithm/pstl_backends/gpu_backends/omp_offload.h
   __algorithm/pstl_copy.h
   __algorithm/pstl_count.h
   __algorithm/pstl_fill.h
diff --git a/libcxx/include/__algorithm/pstl_backend.h b/libcxx/include/__algorithm/pstl_backend.h
index 93372f019031b63..f051e0ce9be13c3 100644
--- a/libcxx/include/__algorithm/pstl_backend.h
+++ b/libcxx/include/__algorithm/pstl_backend.h
@@ -10,6 +10,7 @@
 #define _LIBCPP___ALGORITHM_PSTL_BACKEND_H
 
 #include <__algorithm/pstl_backends/cpu_backend.h>
+#include <__algorithm/pstl_backends/gpu_backend.h>
 #include <__config>
 #include <execution>
 
@@ -179,10 +180,17 @@ struct __select_backend<std::execution::parallel_policy> {
   using type = __cpu_backend_tag;
 };
 
+#   if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
+template <>
+struct __select_backend<std::execution::parallel_unsequenced_policy> {
+  using type = __gpu_backend_tag;
+};
+#   else
 template <>
 struct __select_backend<std::execution::parallel_unsequenced_policy> {
   using type = __cpu_backend_tag;
 };
+#   endif
 
 #  else
 
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
new file mode 100644
index 000000000000000..46a85f77b5deb99
--- /dev/null
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
@@ -0,0 +1,21 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
+
+#include <__config>
+
+#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+
+#if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
+#include <__algorithm/pstl_backends/gpu_backends/fill.h>
+#include <__algorithm/pstl_backends/gpu_backends/for_each.h>
+#endif
+
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/backend.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/backend.h
new file mode 100644
index 000000000000000..a8b400afbb94d9d
--- /dev/null
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/backend.h
@@ -0,0 +1,33 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_BACKEND_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_BACKEND_H
+
+#include <__config>
+#include <cstddef>
+
+#if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
+#  include <__algorithm/pstl_backends/gpu_backends/omp_offload.h>
+#endif
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+#if _LIBCPP_STD_VER >= 17
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+struct __gpu_backend_tag {};
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // _LIBCPP_STD_VER >= 17
+
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_BACKEND_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
new file mode 100644
index 000000000000000..5603e18a5d2d3fc
--- /dev/null
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
@@ -0,0 +1,59 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FILL_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FILL_H
+
+#include <__algorithm/fill.h>
+#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
+#include <__config>
+#include <__iterator/concepts.h>
+#include <__type_traits/is_execution_policy.h>
+#include <__utility/terminate_on_exception.h>
+#include <stdio.h>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+template <class _ExecutionPolicy, class _ForwardIterator, class _Tp>
+_LIBCPP_HIDE_FROM_ABI void
+__pstl_fill(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) {
+  // It is only safe to execute for_each on the GPU, it the execution policy is 
+  // parallel unsequenced, as it is the only execution policy prohibiting throwing
+  // exceptions and allowing SIMD instructions
+  if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
+                       __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+    std::__par_backend::__parallel_for_simd_val_1(__first, __last - __first, __value);
+  }
+  // Else if the excution policy is parallel, we execute for_each on the CPU instead
+  else  if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
+                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+    std::__terminate_on_exception([&] {
+      __par_backend::__parallel_for(
+          __first, __last, [&__value](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
+            std::__pstl_fill<__remove_parallel_policy_t<_ExecutionPolicy>>(
+                __cpu_backend_tag{}, __brick_first, __brick_last, __value);
+          });
+    });
+  // Else we execute for_each in serial
+  } else {
+    std::fill(__first, __last, __value);
+  }
+}
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FILL_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
new file mode 100644
index 000000000000000..20486d83863f420
--- /dev/null
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
@@ -0,0 +1,59 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FOR_EACH_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FOR_EACH_H
+
+#include <__algorithm/for_each.h>
+#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
+#include <__config>
+#include <__iterator/concepts.h>
+#include <__type_traits/is_execution_policy.h>
+#include <__utility/terminate_on_exception.h>
+#include <stdio.h>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+template <class _ExecutionPolicy, class _ForwardIterator, class _Functor>
+_LIBCPP_HIDE_FROM_ABI void
+__pstl_for_each(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Functor __func) {
+  // It is only safe to execute for_each on the GPU, it the execution policy is 
+  // parallel unsequenced, as it is the only execution policy prohibiting throwing
+  // exceptions and allowing SIMD instructions
+  if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
+                       __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+    std::__par_backend::__parallel_for_simd_1(__first, __last - __first, __func);
+  }
+  // Else if the excution policy is parallel, we execute for_each on the CPU instead
+  else  if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
+                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+    std::__terminate_on_exception([&] {
+      std::__par_backend::__parallel_for(
+          __first, __last, [__func](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
+            std::__pstl_for_each<__remove_parallel_policy_t<_ExecutionPolicy>>(
+                __cpu_backend_tag{}, __brick_first, __brick_last, __func);
+          });
+    });
+  // Else we execute for_each in serial
+  } else {
+    std::for_each(__first, __last, __func);
+  }
+}
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FOR_EACH_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
new file mode 100644
index 000000000000000..840118dbec5057c
--- /dev/null
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
@@ -0,0 +1,91 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_OMP_OFFLOAD_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_OMP_OFFLOAD_H
+
+#include <__assert>
+#include <__config>
+#include <__utility/move.h>
+#include <cstddef>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+_LIBCPP_PUSH_MACROS
+#include <__undef_macros>
+
+#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+namespace __par_backend {
+inline namespace __omp_gpu_backend {
+
+// In OpenMP, we need to extract the pointer for the underlying data for data
+// structures like std::vector and std::array to be able to map the data to the
+// device.
+
+template <typename T>
+_LIBCPP_HIDE_FROM_ABI inline T __omp_extract_base_ptr(T p) {
+  return p;
+}
+
+template <typename T>
+_LIBCPP_HIDE_FROM_ABI inline T __omp_extract_base_ptr(std::__wrap_iter<T> w) {
+  std::pointer_traits<std::__wrap_iter<T>> PT;
+  return PT.to_address(w);
+}
+
+// Applying function or lambda in a loop
+
+template <class _Iterator, class _DifferenceType, class _Function>
+_LIBCPP_HIDE_FROM_ABI _Iterator __omp_parallel_for_simd_1(_Iterator __first, _DifferenceType __n, _Function __f) noexcept {
+  #pragma omp target teams distribute parallel for simd map(tofrom:__first[0:__n])
+  for (_DifferenceType __i = 0; __i < __n; ++__i)
+    __f(__first[__i]);
+
+  return __first + __n;
+}
+
+// Extracting the underlying pointer
+
+template <class _Iterator, class _DifferenceType, class _Function>
+_LIBCPP_HIDE_FROM_ABI _Iterator __parallel_for_simd_1(_Iterator __first, _DifferenceType __n, _Function __f) noexcept {
+  __omp_parallel_for_simd_1(__omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __f);
+  return __first + __n;
+}
+
+// Assigning a value in a loop
+
+template <class _Index, class _DifferenceType, class _Tp>
+_LIBCPP_HIDE_FROM_ABI _Index __omp_parallel_for_simd_val_1(_Index __first, _DifferenceType __n, const _Tp& __value) noexcept {
+  #pragma omp target teams distribute parallel for simd map(tofrom:__first[0:__n]) map(to:__value)
+  for (_DifferenceType __i = 0; __i < __n; ++__i)
+    __first[__i] = __value;
+
+  return __first + __n;
+}
+
+template <class _Index, class _DifferenceType, class _Tp>
+_LIBCPP_HIDE_FROM_ABI _Index __parallel_for_simd_val_1(_Index __first, _DifferenceType __n, const _Tp& __value) noexcept {
+  __omp_parallel_for_simd_val_1(__omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __value);
+  return __first + __n;
+}
+
+} // namespace __omp_gpu_backend
+} // namespace __par_backend
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && && _LIBCPP_STD_VER >= 17
+
+_LIBCPP_POP_MACROS
+
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_OMP_OFFLOAD_H
diff --git a/libcxx/include/__config_site.in b/libcxx/include/__config_site.in
index c85cbcd02c441b9..e0edddce3afc3ff 100644
--- a/libcxx/include/__config_site.in
+++ b/libcxx/include/__config_site.in
@@ -34,6 +34,7 @@
 #cmakedefine _LIBCPP_PSTL_CPU_BACKEND_SERIAL
 #cmakedefine _LIBCPP_PSTL_CPU_BACKEND_THREAD
 #cmakedefine _LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH
+#cmakedefine _LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD
 
 // Hardening.
 #cmakedefine01 _LIBCPP_ENABLE_HARDENED_MODE_DEFAULT

>From af5ddf7709e44435c3b0b15421aa9cfc24b49e84 Mon Sep 17 00:00:00 2001
From: antonrydahl <rydahl2610 at gmail.com>
Date: Wed, 20 Sep 2023 17:48:25 -0700
Subject: [PATCH 2/5] Clang formatting OpenMP backend for parallel algorithms

---
 libcxx/include/__algorithm/pstl_backend.h           |  6 +++---
 .../include/__algorithm/pstl_backends/gpu_backend.h |  4 ++--
 .../__algorithm/pstl_backends/gpu_backends/fill.h   | 12 ++++++------
 .../pstl_backends/gpu_backends/for_each.h           | 12 ++++++------
 .../pstl_backends/gpu_backends/omp_offload.h        | 13 ++++++++-----
 5 files changed, 25 insertions(+), 22 deletions(-)

diff --git a/libcxx/include/__algorithm/pstl_backend.h b/libcxx/include/__algorithm/pstl_backend.h
index f051e0ce9be13c3..47f5191b48517ba 100644
--- a/libcxx/include/__algorithm/pstl_backend.h
+++ b/libcxx/include/__algorithm/pstl_backend.h
@@ -180,17 +180,17 @@ struct __select_backend<std::execution::parallel_policy> {
   using type = __cpu_backend_tag;
 };
 
-#   if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
+#    if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
 template <>
 struct __select_backend<std::execution::parallel_unsequenced_policy> {
   using type = __gpu_backend_tag;
 };
-#   else
+#    else
 template <>
 struct __select_backend<std::execution::parallel_unsequenced_policy> {
   using type = __cpu_backend_tag;
 };
-#   endif
+#    endif
 
 #  else
 
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
index 46a85f77b5deb99..7237036156a1bf3 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
@@ -14,8 +14,8 @@
 #include <__algorithm/pstl_backends/gpu_backends/backend.h>
 
 #if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
-#include <__algorithm/pstl_backends/gpu_backends/fill.h>
-#include <__algorithm/pstl_backends/gpu_backends/for_each.h>
+#  include <__algorithm/pstl_backends/gpu_backends/fill.h>
+#  include <__algorithm/pstl_backends/gpu_backends/for_each.h>
 #endif
 
 #endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
index 5603e18a5d2d3fc..32926da87e2a083 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
@@ -10,8 +10,8 @@
 #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FILL_H
 
 #include <__algorithm/fill.h>
-#include <__algorithm/pstl_backends/gpu_backends/backend.h>
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
+#include <__algorithm/pstl_backends/gpu_backends/backend.h>
 #include <__config>
 #include <__iterator/concepts.h>
 #include <__type_traits/is_execution_policy.h>
@@ -29,16 +29,16 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 template <class _ExecutionPolicy, class _ForwardIterator, class _Tp>
 _LIBCPP_HIDE_FROM_ABI void
 __pstl_fill(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) {
-  // It is only safe to execute for_each on the GPU, it the execution policy is 
+  // It is only safe to execute for_each on the GPU, it the execution policy is
   // parallel unsequenced, as it is the only execution policy prohibiting throwing
   // exceptions and allowing SIMD instructions
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
-                       __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
     std::__par_backend::__parallel_for_simd_val_1(__first, __last - __first, __value);
   }
   // Else if the excution policy is parallel, we execute for_each on the CPU instead
-  else  if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
-                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+  else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
+                     __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
     std::__terminate_on_exception([&] {
       __par_backend::__parallel_for(
           __first, __last, [&__value](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
@@ -46,7 +46,7 @@ __pstl_fill(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last
                 __cpu_backend_tag{}, __brick_first, __brick_last, __value);
           });
     });
-  // Else we execute for_each in serial
+    // Else we execute for_each in serial
   } else {
     std::fill(__first, __last, __value);
   }
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
index 20486d83863f420..14de2af8e4a15c6 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
@@ -10,8 +10,8 @@
 #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FOR_EACH_H
 
 #include <__algorithm/for_each.h>
-#include <__algorithm/pstl_backends/gpu_backends/backend.h>
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
+#include <__algorithm/pstl_backends/gpu_backends/backend.h>
 #include <__config>
 #include <__iterator/concepts.h>
 #include <__type_traits/is_execution_policy.h>
@@ -29,16 +29,16 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 template <class _ExecutionPolicy, class _ForwardIterator, class _Functor>
 _LIBCPP_HIDE_FROM_ABI void
 __pstl_for_each(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Functor __func) {
-  // It is only safe to execute for_each on the GPU, it the execution policy is 
+  // It is only safe to execute for_each on the GPU, it the execution policy is
   // parallel unsequenced, as it is the only execution policy prohibiting throwing
   // exceptions and allowing SIMD instructions
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
-                       __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
     std::__par_backend::__parallel_for_simd_1(__first, __last - __first, __func);
   }
   // Else if the excution policy is parallel, we execute for_each on the CPU instead
-  else  if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
-                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+  else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
+                     __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
     std::__terminate_on_exception([&] {
       std::__par_backend::__parallel_for(
           __first, __last, [__func](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
@@ -46,7 +46,7 @@ __pstl_for_each(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __
                 __cpu_backend_tag{}, __brick_first, __brick_last, __func);
           });
     });
-  // Else we execute for_each in serial
+    // Else we execute for_each in serial
   } else {
     std::for_each(__first, __last, __func);
   }
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
index 840118dbec5057c..4baa4e7f65859d1 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
@@ -46,8 +46,9 @@ _LIBCPP_HIDE_FROM_ABI inline T __omp_extract_base_ptr(std::__wrap_iter<T> w) {
 // Applying function or lambda in a loop
 
 template <class _Iterator, class _DifferenceType, class _Function>
-_LIBCPP_HIDE_FROM_ABI _Iterator __omp_parallel_for_simd_1(_Iterator __first, _DifferenceType __n, _Function __f) noexcept {
-  #pragma omp target teams distribute parallel for simd map(tofrom:__first[0:__n])
+_LIBCPP_HIDE_FROM_ABI _Iterator
+__omp_parallel_for_simd_1(_Iterator __first, _DifferenceType __n, _Function __f) noexcept {
+#  pragma omp target teams distribute parallel for simd map(tofrom : __first[0 : __n])
   for (_DifferenceType __i = 0; __i < __n; ++__i)
     __f(__first[__i]);
 
@@ -65,8 +66,9 @@ _LIBCPP_HIDE_FROM_ABI _Iterator __parallel_for_simd_1(_Iterator __first, _Differ
 // Assigning a value in a loop
 
 template <class _Index, class _DifferenceType, class _Tp>
-_LIBCPP_HIDE_FROM_ABI _Index __omp_parallel_for_simd_val_1(_Index __first, _DifferenceType __n, const _Tp& __value) noexcept {
-  #pragma omp target teams distribute parallel for simd map(tofrom:__first[0:__n]) map(to:__value)
+_LIBCPP_HIDE_FROM_ABI _Index
+__omp_parallel_for_simd_val_1(_Index __first, _DifferenceType __n, const _Tp& __value) noexcept {
+#  pragma omp target teams distribute parallel for simd map(tofrom : __first[0 : __n]) map(to : __value)
   for (_DifferenceType __i = 0; __i < __n; ++__i)
     __first[__i] = __value;
 
@@ -74,7 +76,8 @@ _LIBCPP_HIDE_FROM_ABI _Index __omp_parallel_for_simd_val_1(_Index __first, _Diff
 }
 
 template <class _Index, class _DifferenceType, class _Tp>
-_LIBCPP_HIDE_FROM_ABI _Index __parallel_for_simd_val_1(_Index __first, _DifferenceType __n, const _Tp& __value) noexcept {
+_LIBCPP_HIDE_FROM_ABI _Index
+__parallel_for_simd_val_1(_Index __first, _DifferenceType __n, const _Tp& __value) noexcept {
   __omp_parallel_for_simd_val_1(__omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __value);
   return __first + __n;
 }

>From 57abf3062c4e559fddd6bf173d415212b9f92e43 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Thu, 21 Sep 2023 12:50:17 -0700
Subject: [PATCH 3/5] Making PSTL GPU backend depend on CMake options rather
 than command line options

---
 libcxx/CMakeLists.txt                                 | 11 +++++------
 libcxx/include/__algorithm/pstl_backend.h             |  2 +-
 .../include/__algorithm/pstl_backends/gpu_backend.h   |  2 +-
 .../__algorithm/pstl_backends/gpu_backends/backend.h  |  8 ++++++--
 libcxx/include/__config_site.in                       |  1 +
 5 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt
index 43d2a448de79584..7aa47caa1ca335a 100644
--- a/libcxx/CMakeLists.txt
+++ b/libcxx/CMakeLists.txt
@@ -299,12 +299,10 @@ else()
   set(LIBCXX_PSTL_CPU_BACKEND "serial" CACHE STRING "Which PSTL CPU backend to use")
 endif()
 
-if (NOT DEFINED LIBCXX_PSTL_GPU_BACKEND)
-  if (${LIBCXX_ENABLE_GPU_OFFLOAD})
-    set(LIBCXX_PSTL_GPU_BACKEND "omp_offload" CACHE STRING "Which PSTL GPU backend to use")
-  else()
-    set(LIBCXX_PSTL_GPU_BACKEND "none" CACHE STRING "Which PSTL GPU backend to use")
-  endif()
+if (${LIBCXX_ENABLE_GPU_OFFLOAD})
+  set(LIBCXX_PSTL_GPU_BACKEND "omp_offload" CACHE STRING "Which PSTL GPU backend to use")
+else()
+  set(LIBCXX_PSTL_GPU_BACKEND "none" CACHE STRING "Which PSTL GPU backend to use")
 endif()
 
 # Misc options ----------------------------------------------------------------
@@ -819,6 +817,7 @@ else()
                        Valid backends are: serial, std_thread and libdispatch")
 endif()
 
+config_define_if(LIBCXX_ENABLE_GPU_OFFLOAD _LIBCPP_PSTL_GPU_OFFLOAD)
 if (LIBCXX_PSTL_GPU_BACKEND STREQUAL "omp_offload")
   config_define(1 _LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
 endif()
diff --git a/libcxx/include/__algorithm/pstl_backend.h b/libcxx/include/__algorithm/pstl_backend.h
index 47f5191b48517ba..0df8847fd33589a 100644
--- a/libcxx/include/__algorithm/pstl_backend.h
+++ b/libcxx/include/__algorithm/pstl_backend.h
@@ -180,7 +180,7 @@ struct __select_backend<std::execution::parallel_policy> {
   using type = __cpu_backend_tag;
 };
 
-#    if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
+#    if defined(_LIBCPP_PSTL_GPU_OFFLOAD)
 template <>
 struct __select_backend<std::execution::parallel_unsequenced_policy> {
   using type = __gpu_backend_tag;
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
index 7237036156a1bf3..d2a814b441224a5 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
@@ -13,7 +13,7 @@
 
 #include <__algorithm/pstl_backends/gpu_backends/backend.h>
 
-#if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
+#if defined(_LIBCPP_PSTL_GPU_OFFLOAD)
 #  include <__algorithm/pstl_backends/gpu_backends/fill.h>
 #  include <__algorithm/pstl_backends/gpu_backends/for_each.h>
 #endif
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/backend.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/backend.h
index a8b400afbb94d9d..a03ad35d8d2ae3e 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/backend.h
@@ -12,8 +12,12 @@
 #include <__config>
 #include <cstddef>
 
-#if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
-#  include <__algorithm/pstl_backends/gpu_backends/omp_offload.h>
+#if defined(_LIBCPP_PSTL_GPU_OFFLOAD)
+#  if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
+#    include <__algorithm/pstl_backends/gpu_backends/omp_offload.h>
+#  else
+#    error Invalid PSTL GPU backend
+#  endif
 #endif
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
diff --git a/libcxx/include/__config_site.in b/libcxx/include/__config_site.in
index e0edddce3afc3ff..e7fb4f423079333 100644
--- a/libcxx/include/__config_site.in
+++ b/libcxx/include/__config_site.in
@@ -35,6 +35,7 @@
 #cmakedefine _LIBCPP_PSTL_CPU_BACKEND_THREAD
 #cmakedefine _LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH
 #cmakedefine _LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD
+#cmakedefine _LIBCPP_PSTL_GPU_OFFLOAD
 
 // Hardening.
 #cmakedefine01 _LIBCPP_ENABLE_HARDENED_MODE_DEFAULT

>From 51d9ed5702a46ac604bafbe2f707033639f86706 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Thu, 21 Sep 2023 17:07:58 -0700
Subject: [PATCH 4/5] Added OpenMP offloaded version of std::transform

---
 libcxx/include/CMakeLists.txt                 |   1 +
 .../__algorithm/pstl_backends/gpu_backend.h   |   1 +
 .../pstl_backends/gpu_backends/omp_offload.h  | 119 +++++++++++++++++-
 .../pstl_backends/gpu_backends/transform.h    | 117 +++++++++++++++++
 4 files changed, 233 insertions(+), 5 deletions(-)
 create mode 100644 libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h

diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index a3d72df61a86dde..66e54cfbf1780ee 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -90,6 +90,7 @@ set(files
   __algorithm/pstl_backends/gpu_backends/fill.h
   __algorithm/pstl_backends/gpu_backends/for_each.h
   __algorithm/pstl_backends/gpu_backends/omp_offload.h
+  __algorithm/pstl_backends/gpu_backends/transform.h
   __algorithm/pstl_copy.h
   __algorithm/pstl_count.h
   __algorithm/pstl_fill.h
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
index d2a814b441224a5..dac26592dac5c1f 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
@@ -16,6 +16,7 @@
 #if defined(_LIBCPP_PSTL_GPU_OFFLOAD)
 #  include <__algorithm/pstl_backends/gpu_backends/fill.h>
 #  include <__algorithm/pstl_backends/gpu_backends/for_each.h>
+#  include <__algorithm/pstl_backends/gpu_backends/transform.h>
 #endif
 
 #endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
index 4baa4e7f65859d1..69221cbb8519233 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
@@ -28,6 +28,17 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 namespace __par_backend {
 inline namespace __omp_gpu_backend {
 
+// Checking if a pointer is in a range
+template <typename T1, typename T2, typename T3>
+_LIBCPP_HIDE_FROM_ABI inline bool __omp_in_ptr_range(T1 a, T2 p, T3 b) {
+  return false;
+}
+
+template <typename T>
+_LIBCPP_HIDE_FROM_ABI inline bool __omp_in_ptr_range(T* a, T* p, T* b) {
+  return std::less_equal<T*>{}(a, p) && std::less<T*>{}(p, b);
+}
+
 // In OpenMP, we need to extract the pointer for the underlying data for data
 // structures like std::vector and std::array to be able to map the data to the
 // device.
@@ -43,12 +54,16 @@ _LIBCPP_HIDE_FROM_ABI inline T __omp_extract_base_ptr(std::__wrap_iter<T> w) {
   return PT.to_address(w);
 }
 
+//===----------------------------------------------------------------------===//
+// Templates for one iterator
+//===----------------------------------------------------------------------===//
+
 // Applying function or lambda in a loop
 
 template <class _Iterator, class _DifferenceType, class _Function>
 _LIBCPP_HIDE_FROM_ABI _Iterator
-__omp_parallel_for_simd_1(_Iterator __first, _DifferenceType __n, _Function __f) noexcept {
-#  pragma omp target teams distribute parallel for simd map(tofrom : __first[0 : __n])
+__omp_parallel_for_simd_1(_Iterator __first, _DifferenceType __n, _Function __f, const int __device = 0) noexcept {
+#  pragma omp target teams distribute parallel for simd map(tofrom : __first[0 : __n]) device(__device)
   for (_DifferenceType __i = 0; __i < __n; ++__i)
     __f(__first[__i]);
 
@@ -66,9 +81,10 @@ _LIBCPP_HIDE_FROM_ABI _Iterator __parallel_for_simd_1(_Iterator __first, _Differ
 // Assigning a value in a loop
 
 template <class _Index, class _DifferenceType, class _Tp>
-_LIBCPP_HIDE_FROM_ABI _Index
-__omp_parallel_for_simd_val_1(_Index __first, _DifferenceType __n, const _Tp& __value) noexcept {
-#  pragma omp target teams distribute parallel for simd map(tofrom : __first[0 : __n]) map(to : __value)
+_LIBCPP_HIDE_FROM_ABI _Index __omp_parallel_for_simd_val_1(
+    _Index __first, _DifferenceType __n, const _Tp& __value, const int __device = 0) noexcept {
+#  pragma omp target teams distribute parallel for simd map(from : __first[0 : __n]) map(always, to : __value)         \
+      device(__device)
   for (_DifferenceType __i = 0; __i < __n; ++__i)
     __first[__i] = __value;
 
@@ -82,6 +98,99 @@ __parallel_for_simd_val_1(_Index __first, _DifferenceType __n, const _Tp& __valu
   return __first + __n;
 }
 
+//===----------------------------------------------------------------------===//
+// Templates for two iterators
+//===----------------------------------------------------------------------===//
+
+template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Function>
+_LIBCPP_HIDE_FROM_ABI _Iterator1 __omp_parallel_for_simd_2(
+    _Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Function __f, const int __device = 0) noexcept {
+  if ((!std::is_same<_Iterator1, _Iterator2>::value) ||
+      (std::is_same<_Iterator1, _Iterator2>::value &&
+       !__omp_gpu_backend::__omp_in_ptr_range(__first1, __first2, __first1 + __n))) {
+#  pragma omp target teams distribute parallel for simd map(to : __first1[0 : __n]) map(from : __first2[0 : __n])      \
+      device(__device)
+    for (_DifferenceType __i = 0; __i < __n; ++__i)
+      __f(__first1[__i], __first2[__i]);
+    return __first1 + __n;
+  }
+#  pragma omp target teams distribute parallel for simd map(tofrom : __first1[0 : __n], __first2[0 : __n])             \
+      device(__device)
+  for (_DifferenceType __i = 0; __i < __n; ++__i)
+    __f(__first1[__i], __first2[__i]);
+
+  return __first1 + __n;
+}
+
+// Extracting the underlying pointer
+
+template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Function>
+_LIBCPP_HIDE_FROM_ABI _Iterator1
+__parallel_for_simd_2(_Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Function __f) noexcept {
+  __omp_parallel_for_simd_2(
+      __omp_gpu_backend::__omp_extract_base_ptr(__first1),
+      __n,
+      __omp_gpu_backend::__omp_extract_base_ptr(__first2),
+      __f);
+  return __first1 + __n;
+}
+
+//===----------------------------------------------------------------------===//
+// Templates for three iterator
+//===----------------------------------------------------------------------===//
+
+template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Iterator3, class _Function>
+_LIBCPP_HIDE_FROM_ABI _Iterator1 __omp_parallel_for_simd_3(
+    _Iterator1 __first1,
+    _DifferenceType __n,
+    _Iterator2 __first2,
+    _Iterator3 __first3,
+    _Function __f,
+    const int __device = 0) noexcept {
+  // It may be that __first3 is in the interval [__first1+__n) or [__firt2+__n)
+  // It is, however, undefined behavior to compare two pointers that do not
+  // point to the same object or are not the same type.
+  // If we can prove that __first3 is not in any of the ranges [__first1+__n)
+  // or [__firt2+__n), it is safe to reduce the amount of data copied to and
+  // from the device
+  constexpr bool are_not_same_type =
+      !std::is_same<_Iterator1, _Iterator2>::value && !std::is_same<_Iterator1, _Iterator3>::value;
+  const bool no_overlap_13 =
+      std::is_same<_Iterator1, _Iterator3>::value &&
+      !__omp_gpu_backend::__omp_in_ptr_range(__first1, __first3, __first1 + __n);
+  const bool no_overlap_23 =
+      std::is_same<_Iterator2, _Iterator3>::value &&
+      !__omp_gpu_backend::__omp_in_ptr_range(__first2, __first3, __first2 + __n);
+  if (are_not_same_type || (no_overlap_13 && no_overlap_23)) {
+#  pragma omp target teams distribute parallel for simd map(to : __first1[0 : __n], __first2[0 : __n])                 \
+      map(from : __first3[0 : __n]) device(__device)
+    for (_DifferenceType __i = 0; __i < __n; ++__i)
+      __f(__first1[__i], __first2[__i], __first3[__i]);
+    return __first1 + __n;
+  }
+  // In the general case, we have to map all data to and from the device
+#  pragma omp target teams distribute parallel for simd map(                                                           \
+          tofrom : __first1[0 : __n], __first2[0 : __n], __first3[0 : __n]) device(__device)
+  for (_DifferenceType __i = 0; __i < __n; ++__i)
+    __f(__first1[__i], __first2[__i], __first3[__i]);
+
+  return __first1 + __n;
+}
+
+// Extracting the underlying pointer
+
+template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Iterator3, class _Function>
+_LIBCPP_HIDE_FROM_ABI _Iterator1 __parallel_for_simd_3(
+    _Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Iterator3 __first3, _Function __f) noexcept {
+  __omp_parallel_for_simd_3(
+      __omp_gpu_backend::__omp_extract_base_ptr(__first1),
+      __n,
+      __omp_gpu_backend::__omp_extract_base_ptr(__first2),
+      __omp_gpu_backend::__omp_extract_base_ptr(__first3),
+      __f);
+  return __first1 + __n;
+}
+
 } // namespace __omp_gpu_backend
 } // namespace __par_backend
 
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
new file mode 100644
index 000000000000000..03eba11a3f5f52b
--- /dev/null
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
@@ -0,0 +1,117 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_TRANSFORM_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_TRANSFORM_H
+
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
+#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__algorithm/transform.h>
+#include <__config>
+#include <__iterator/concepts.h>
+#include <__iterator/iterator_traits.h>
+#include <__type_traits/enable_if.h>
+#include <__type_traits/is_execution_policy.h>
+#include <__type_traits/remove_cvref.h>
+#include <__utility/terminate_on_exception.h>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+template <class _ExecutionPolicy, class _ForwardIterator, class _ForwardOutIterator, class _UnaryOperation>
+_LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
+    __gpu_backend_tag,
+    _ForwardIterator __first,
+    _ForwardIterator __last,
+    _ForwardOutIterator __result,
+    _UnaryOperation __op) {
+  if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
+                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
+                __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) {
+    return std::__par_backend::__parallel_for_simd_2(
+        __first,
+        __last - __first,
+        __result,
+        [&](__iter_reference<_ForwardIterator> __in_value, __iter_reference<_ForwardOutIterator> __out_value) {
+          __out_value = __op(__in_value);
+        });
+  } else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
+                       __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
+                       __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) {
+    std::__terminate_on_exception([&] {
+      std::__par_backend::__parallel_for(
+          __first, __last, [__op, __first, __result](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
+            return std::__pstl_transform<__remove_parallel_policy_t<_ExecutionPolicy>>(
+                __cpu_backend_tag{}, __brick_first, __brick_last, __result + (__brick_first - __first), __op);
+          });
+    });
+    return __result + (__last - __first);
+  } else {
+    return std::transform(__first, __last, __result, __op);
+  }
+}
+
+template <class _ExecutionPolicy,
+          class _ForwardIterator1,
+          class _ForwardIterator2,
+          class _ForwardOutIterator,
+          class _BinaryOperation,
+          enable_if_t<is_execution_policy_v<__remove_cvref_t<_ExecutionPolicy>>, int> = 0>
+_LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
+    __gpu_backend_tag,
+    _ForwardIterator1 __first1,
+    _ForwardIterator1 __last1,
+    _ForwardIterator2 __first2,
+    _ForwardOutIterator __result,
+    _BinaryOperation __op) {
+  if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
+                __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
+                __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value &&
+                __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) {
+    return std::__par_backend::__parallel_for_simd_3(
+        __first1,
+        __last1 - __first1,
+        __first2,
+        __result,
+        [&](__iter_reference<_ForwardIterator1> __in1,
+            __iter_reference<_ForwardIterator2> __in2,
+            __iter_reference<_ForwardOutIterator> __out_value) { __out_value = __op(__in1, __in2); });
+  } else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
+                       __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
+                       __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value &&
+                       __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) {
+    std::__terminate_on_exception([&] {
+      std::__par_backend::__parallel_for(
+          __first1,
+          __last1,
+          [__op, __first1, __first2, __result](_ForwardIterator1 __brick_first, _ForwardIterator1 __brick_last) {
+            return std::__pstl_transform<__remove_parallel_policy_t<_ExecutionPolicy>>(
+                __cpu_backend_tag{},
+                __brick_first,
+                __brick_last,
+                __first2 + (__brick_first - __first1),
+                __result + (__brick_first - __first1),
+                __op);
+          });
+    });
+    return __result + (__last1 - __first1);
+  } else {
+    return std::transform(__first1, __last1, __first2, __result, __op);
+  }
+}
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_TRANSFORM_H

>From 33b61efe005cb12247cdf89ce7b7a4c5ca849f5b Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Fri, 22 Sep 2023 11:55:53 -0700
Subject: [PATCH 5/5] Changing lambdas to capture by value in std::transform
 for GPUs

---
 .../__algorithm/pstl_backends/gpu_backends/transform.h    | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
index 03eba11a3f5f52b..7fcfde44aaaa7a6 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
@@ -38,11 +38,13 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
   if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
                 __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) {
+    // While the CPU backend captures by reference, [&], that is not valid when
+    // offloading to the GPU. Therefore we must capture by value, [=].
     return std::__par_backend::__parallel_for_simd_2(
         __first,
         __last - __first,
         __result,
-        [&](__iter_reference<_ForwardIterator> __in_value, __iter_reference<_ForwardOutIterator> __out_value) {
+        [=](__iter_reference<_ForwardIterator> __in_value, __iter_reference<_ForwardOutIterator> __out_value) {
           __out_value = __op(__in_value);
         });
   } else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
@@ -78,12 +80,14 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
                 __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value &&
                 __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) {
+    // While the CPU backend captures by reference, [&], that is not valid when
+    // offloading to the GPU. Therefore we must capture by value, [=].
     return std::__par_backend::__parallel_for_simd_3(
         __first1,
         __last1 - __first1,
         __first2,
         __result,
-        [&](__iter_reference<_ForwardIterator1> __in1,
+        [=](__iter_reference<_ForwardIterator1> __in1,
             __iter_reference<_ForwardIterator2> __in2,
             __iter_reference<_ForwardOutIterator> __out_value) { __out_value = __op(__in1, __in2); });
   } else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&