[clang] Adding Separate OpenMP Offloading Backend to `libcxx/include/__algorithm/pstl_backends` (PR #66968)
Anton Rydahl via cfe-commits
cfe-commits at lists.llvm.org
Mon Oct 2 12:29:46 PDT 2023
https://github.com/AntonRydahl updated https://github.com/llvm/llvm-project/pull/66968
>From b35340e47de896c9933c54ce617538c46cf01488 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Wed, 20 Sep 2023 17:06:10 -0700
Subject: [PATCH 01/10] Adding OpenMP Offloading Backend for C++ Parallel
Algorithms
---
libcxx/CMakeLists.txt | 14 +++
libcxx/include/CMakeLists.txt | 5 +
libcxx/include/__algorithm/pstl_backend.h | 8 ++
.../__algorithm/pstl_backends/gpu_backend.h | 21 +++++
.../pstl_backends/gpu_backends/backend.h | 33 +++++++
.../pstl_backends/gpu_backends/fill.h | 59 ++++++++++++
.../pstl_backends/gpu_backends/for_each.h | 59 ++++++++++++
.../pstl_backends/gpu_backends/omp_offload.h | 91 +++++++++++++++++++
libcxx/include/__config_site.in | 1 +
9 files changed, 291 insertions(+)
create mode 100644 libcxx/include/__algorithm/pstl_backends/gpu_backend.h
create mode 100644 libcxx/include/__algorithm/pstl_backends/gpu_backends/backend.h
create mode 100644 libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
create mode 100644 libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
create mode 100644 libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt
index bb2898b799bcef9..43d2a448de79584 100644
--- a/libcxx/CMakeLists.txt
+++ b/libcxx/CMakeLists.txt
@@ -290,6 +290,8 @@ option(LIBCXX_HAS_WIN32_THREAD_API "Ignore auto-detection and force use of win32
option(LIBCXX_HAS_EXTERNAL_THREAD_API
"Build libc++ with an externalized threading API.
This option may only be set to ON when LIBCXX_ENABLE_THREADS=ON." OFF)
+option(LIBCXX_ENABLE_GPU_OFFLOAD
+ "Build libc++ with support for GPU offload" OFF)
if (LIBCXX_ENABLE_THREADS)
set(LIBCXX_PSTL_CPU_BACKEND "std_thread" CACHE STRING "Which PSTL CPU backend to use")
@@ -297,6 +299,14 @@ else()
set(LIBCXX_PSTL_CPU_BACKEND "serial" CACHE STRING "Which PSTL CPU backend to use")
endif()
+if (NOT DEFINED LIBCXX_PSTL_GPU_BACKEND)
+ if (${LIBCXX_ENABLE_GPU_OFFLOAD})
+ set(LIBCXX_PSTL_GPU_BACKEND "omp_offload" CACHE STRING "Which PSTL GPU backend to use")
+ else()
+ set(LIBCXX_PSTL_GPU_BACKEND "none" CACHE STRING "Which PSTL GPU backend to use")
+ endif()
+endif()
+
# Misc options ----------------------------------------------------------------
# FIXME: Turn -pedantic back ON. It is currently off because it warns
# about #include_next which is used everywhere.
@@ -809,6 +819,10 @@ else()
Valid backends are: serial, std_thread and libdispatch")
endif()
+if (LIBCXX_PSTL_GPU_BACKEND STREQUAL "omp_offload")
+ config_define(1 _LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
+endif()
+
if (LIBCXX_ABI_DEFINES)
set(abi_defines)
foreach (abi_define ${LIBCXX_ABI_DEFINES})
diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index 2ec755236dbaee2..a3d72df61a86dde 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -85,6 +85,11 @@ set(files
__algorithm/pstl_backends/cpu_backends/thread.h
__algorithm/pstl_backends/cpu_backends/transform.h
__algorithm/pstl_backends/cpu_backends/transform_reduce.h
+ __algorithm/pstl_backends/gpu_backend.h
+ __algorithm/pstl_backends/gpu_backends/backend.h
+ __algorithm/pstl_backends/gpu_backends/fill.h
+ __algorithm/pstl_backends/gpu_backends/for_each.h
+ __algorithm/pstl_backends/gpu_backends/omp_offload.h
__algorithm/pstl_copy.h
__algorithm/pstl_count.h
__algorithm/pstl_fill.h
diff --git a/libcxx/include/__algorithm/pstl_backend.h b/libcxx/include/__algorithm/pstl_backend.h
index 93372f019031b63..f051e0ce9be13c3 100644
--- a/libcxx/include/__algorithm/pstl_backend.h
+++ b/libcxx/include/__algorithm/pstl_backend.h
@@ -10,6 +10,7 @@
#define _LIBCPP___ALGORITHM_PSTL_BACKEND_H
#include <__algorithm/pstl_backends/cpu_backend.h>
+#include <__algorithm/pstl_backends/gpu_backend.h>
#include <__config>
#include <execution>
@@ -179,10 +180,17 @@ struct __select_backend<std::execution::parallel_policy> {
using type = __cpu_backend_tag;
};
+# if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
+template <>
+struct __select_backend<std::execution::parallel_unsequenced_policy> {
+ using type = __gpu_backend_tag;
+};
+# else
template <>
struct __select_backend<std::execution::parallel_unsequenced_policy> {
using type = __cpu_backend_tag;
};
+# endif
# else
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
new file mode 100644
index 000000000000000..46a85f77b5deb99
--- /dev/null
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
@@ -0,0 +1,21 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
+
+#include <__config>
+
+#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+
+#if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
+#include <__algorithm/pstl_backends/gpu_backends/fill.h>
+#include <__algorithm/pstl_backends/gpu_backends/for_each.h>
+#endif
+
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/backend.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/backend.h
new file mode 100644
index 000000000000000..a8b400afbb94d9d
--- /dev/null
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/backend.h
@@ -0,0 +1,33 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_BACKEND_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_BACKEND_H
+
+#include <__config>
+#include <cstddef>
+
+#if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
+# include <__algorithm/pstl_backends/gpu_backends/omp_offload.h>
+#endif
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+# pragma GCC system_header
+#endif
+
+#if _LIBCPP_STD_VER >= 17
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+struct __gpu_backend_tag {};
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // _LIBCPP_STD_VER >= 17
+
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_BACKEND_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
new file mode 100644
index 000000000000000..5603e18a5d2d3fc
--- /dev/null
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
@@ -0,0 +1,59 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FILL_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FILL_H
+
+#include <__algorithm/fill.h>
+#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
+#include <__config>
+#include <__iterator/concepts.h>
+#include <__type_traits/is_execution_policy.h>
+#include <__utility/terminate_on_exception.h>
+#include <stdio.h>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+# pragma GCC system_header
+#endif
+
+#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+template <class _ExecutionPolicy, class _ForwardIterator, class _Tp>
+_LIBCPP_HIDE_FROM_ABI void
+__pstl_fill(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) {
+ // It is only safe to execute for_each on the GPU, it the execution policy is
+ // parallel unsequenced, as it is the only execution policy prohibiting throwing
+ // exceptions and allowing SIMD instructions
+ if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
+ __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+ std::__par_backend::__parallel_for_simd_val_1(__first, __last - __first, __value);
+ }
+ // Else if the excution policy is parallel, we execute for_each on the CPU instead
+ else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
+ __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+ std::__terminate_on_exception([&] {
+ __par_backend::__parallel_for(
+ __first, __last, [&__value](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
+ std::__pstl_fill<__remove_parallel_policy_t<_ExecutionPolicy>>(
+ __cpu_backend_tag{}, __brick_first, __brick_last, __value);
+ });
+ });
+ // Else we execute for_each in serial
+ } else {
+ std::fill(__first, __last, __value);
+ }
+}
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FILL_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
new file mode 100644
index 000000000000000..20486d83863f420
--- /dev/null
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
@@ -0,0 +1,59 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FOR_EACH_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FOR_EACH_H
+
+#include <__algorithm/for_each.h>
+#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
+#include <__config>
+#include <__iterator/concepts.h>
+#include <__type_traits/is_execution_policy.h>
+#include <__utility/terminate_on_exception.h>
+#include <stdio.h>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+# pragma GCC system_header
+#endif
+
+#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+template <class _ExecutionPolicy, class _ForwardIterator, class _Functor>
+_LIBCPP_HIDE_FROM_ABI void
+__pstl_for_each(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Functor __func) {
+ // It is only safe to execute for_each on the GPU, it the execution policy is
+ // parallel unsequenced, as it is the only execution policy prohibiting throwing
+ // exceptions and allowing SIMD instructions
+ if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
+ __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+ std::__par_backend::__parallel_for_simd_1(__first, __last - __first, __func);
+ }
+ // Else if the excution policy is parallel, we execute for_each on the CPU instead
+ else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
+ __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+ std::__terminate_on_exception([&] {
+ std::__par_backend::__parallel_for(
+ __first, __last, [__func](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
+ std::__pstl_for_each<__remove_parallel_policy_t<_ExecutionPolicy>>(
+ __cpu_backend_tag{}, __brick_first, __brick_last, __func);
+ });
+ });
+ // Else we execute for_each in serial
+ } else {
+ std::for_each(__first, __last, __func);
+ }
+}
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FOR_EACH_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
new file mode 100644
index 000000000000000..840118dbec5057c
--- /dev/null
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
@@ -0,0 +1,91 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_OMP_OFFLOAD_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_OMP_OFFLOAD_H
+
+#include <__assert>
+#include <__config>
+#include <__utility/move.h>
+#include <cstddef>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+# pragma GCC system_header
+#endif
+
+_LIBCPP_PUSH_MACROS
+#include <__undef_macros>
+
+#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+namespace __par_backend {
+inline namespace __omp_gpu_backend {
+
+// In OpenMP, we need to extract the pointer for the underlying data for data
+// structures like std::vector and std::array to be able to map the data to the
+// device.
+
+template <typename T>
+_LIBCPP_HIDE_FROM_ABI inline T __omp_extract_base_ptr(T p) {
+ return p;
+}
+
+template <typename T>
+_LIBCPP_HIDE_FROM_ABI inline T __omp_extract_base_ptr(std::__wrap_iter<T> w) {
+ std::pointer_traits<std::__wrap_iter<T>> PT;
+ return PT.to_address(w);
+}
+
+// Applying function or lambda in a loop
+
+template <class _Iterator, class _DifferenceType, class _Function>
+_LIBCPP_HIDE_FROM_ABI _Iterator __omp_parallel_for_simd_1(_Iterator __first, _DifferenceType __n, _Function __f) noexcept {
+ #pragma omp target teams distribute parallel for simd map(tofrom:__first[0:__n])
+ for (_DifferenceType __i = 0; __i < __n; ++__i)
+ __f(__first[__i]);
+
+ return __first + __n;
+}
+
+// Extracting the underlying pointer
+
+template <class _Iterator, class _DifferenceType, class _Function>
+_LIBCPP_HIDE_FROM_ABI _Iterator __parallel_for_simd_1(_Iterator __first, _DifferenceType __n, _Function __f) noexcept {
+ __omp_parallel_for_simd_1(__omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __f);
+ return __first + __n;
+}
+
+// Assigning a value in a loop
+
+template <class _Index, class _DifferenceType, class _Tp>
+_LIBCPP_HIDE_FROM_ABI _Index __omp_parallel_for_simd_val_1(_Index __first, _DifferenceType __n, const _Tp& __value) noexcept {
+ #pragma omp target teams distribute parallel for simd map(tofrom:__first[0:__n]) map(to:__value)
+ for (_DifferenceType __i = 0; __i < __n; ++__i)
+ __first[__i] = __value;
+
+ return __first + __n;
+}
+
+template <class _Index, class _DifferenceType, class _Tp>
+_LIBCPP_HIDE_FROM_ABI _Index __parallel_for_simd_val_1(_Index __first, _DifferenceType __n, const _Tp& __value) noexcept {
+ __omp_parallel_for_simd_val_1(__omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __value);
+ return __first + __n;
+}
+
+} // namespace __omp_gpu_backend
+} // namespace __par_backend
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && && _LIBCPP_STD_VER >= 17
+
+_LIBCPP_POP_MACROS
+
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_OMP_OFFLOAD_H
diff --git a/libcxx/include/__config_site.in b/libcxx/include/__config_site.in
index c85cbcd02c441b9..e0edddce3afc3ff 100644
--- a/libcxx/include/__config_site.in
+++ b/libcxx/include/__config_site.in
@@ -34,6 +34,7 @@
#cmakedefine _LIBCPP_PSTL_CPU_BACKEND_SERIAL
#cmakedefine _LIBCPP_PSTL_CPU_BACKEND_THREAD
#cmakedefine _LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH
+#cmakedefine _LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD
// Hardening.
#cmakedefine01 _LIBCPP_ENABLE_HARDENED_MODE_DEFAULT
>From af5ddf7709e44435c3b0b15421aa9cfc24b49e84 Mon Sep 17 00:00:00 2001
From: antonrydahl <rydahl2610 at gmail.com>
Date: Wed, 20 Sep 2023 17:48:25 -0700
Subject: [PATCH 02/10] Clang formatting OpenMP backend for parallel algorithms
---
libcxx/include/__algorithm/pstl_backend.h | 6 +++---
.../include/__algorithm/pstl_backends/gpu_backend.h | 4 ++--
.../__algorithm/pstl_backends/gpu_backends/fill.h | 12 ++++++------
.../pstl_backends/gpu_backends/for_each.h | 12 ++++++------
.../pstl_backends/gpu_backends/omp_offload.h | 13 ++++++++-----
5 files changed, 25 insertions(+), 22 deletions(-)
diff --git a/libcxx/include/__algorithm/pstl_backend.h b/libcxx/include/__algorithm/pstl_backend.h
index f051e0ce9be13c3..47f5191b48517ba 100644
--- a/libcxx/include/__algorithm/pstl_backend.h
+++ b/libcxx/include/__algorithm/pstl_backend.h
@@ -180,17 +180,17 @@ struct __select_backend<std::execution::parallel_policy> {
using type = __cpu_backend_tag;
};
-# if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
+# if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
template <>
struct __select_backend<std::execution::parallel_unsequenced_policy> {
using type = __gpu_backend_tag;
};
-# else
+# else
template <>
struct __select_backend<std::execution::parallel_unsequenced_policy> {
using type = __cpu_backend_tag;
};
-# endif
+# endif
# else
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
index 46a85f77b5deb99..7237036156a1bf3 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
@@ -14,8 +14,8 @@
#include <__algorithm/pstl_backends/gpu_backends/backend.h>
#if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
-#include <__algorithm/pstl_backends/gpu_backends/fill.h>
-#include <__algorithm/pstl_backends/gpu_backends/for_each.h>
+# include <__algorithm/pstl_backends/gpu_backends/fill.h>
+# include <__algorithm/pstl_backends/gpu_backends/for_each.h>
#endif
#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
index 5603e18a5d2d3fc..32926da87e2a083 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
@@ -10,8 +10,8 @@
#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FILL_H
#include <__algorithm/fill.h>
-#include <__algorithm/pstl_backends/gpu_backends/backend.h>
#include <__algorithm/pstl_backends/cpu_backends/backend.h>
+#include <__algorithm/pstl_backends/gpu_backends/backend.h>
#include <__config>
#include <__iterator/concepts.h>
#include <__type_traits/is_execution_policy.h>
@@ -29,16 +29,16 @@ _LIBCPP_BEGIN_NAMESPACE_STD
template <class _ExecutionPolicy, class _ForwardIterator, class _Tp>
_LIBCPP_HIDE_FROM_ABI void
__pstl_fill(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) {
- // It is only safe to execute for_each on the GPU, it the execution policy is
+ // It is only safe to execute for_each on the GPU, it the execution policy is
// parallel unsequenced, as it is the only execution policy prohibiting throwing
// exceptions and allowing SIMD instructions
if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
- __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+ __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
std::__par_backend::__parallel_for_simd_val_1(__first, __last - __first, __value);
}
// Else if the excution policy is parallel, we execute for_each on the CPU instead
- else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
- __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+ else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
+ __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
std::__terminate_on_exception([&] {
__par_backend::__parallel_for(
__first, __last, [&__value](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
@@ -46,7 +46,7 @@ __pstl_fill(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last
__cpu_backend_tag{}, __brick_first, __brick_last, __value);
});
});
- // Else we execute for_each in serial
+ // Else we execute for_each in serial
} else {
std::fill(__first, __last, __value);
}
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
index 20486d83863f420..14de2af8e4a15c6 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
@@ -10,8 +10,8 @@
#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FOR_EACH_H
#include <__algorithm/for_each.h>
-#include <__algorithm/pstl_backends/gpu_backends/backend.h>
#include <__algorithm/pstl_backends/cpu_backends/backend.h>
+#include <__algorithm/pstl_backends/gpu_backends/backend.h>
#include <__config>
#include <__iterator/concepts.h>
#include <__type_traits/is_execution_policy.h>
@@ -29,16 +29,16 @@ _LIBCPP_BEGIN_NAMESPACE_STD
template <class _ExecutionPolicy, class _ForwardIterator, class _Functor>
_LIBCPP_HIDE_FROM_ABI void
__pstl_for_each(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Functor __func) {
- // It is only safe to execute for_each on the GPU, it the execution policy is
+ // It is only safe to execute for_each on the GPU, it the execution policy is
// parallel unsequenced, as it is the only execution policy prohibiting throwing
// exceptions and allowing SIMD instructions
if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
- __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+ __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
std::__par_backend::__parallel_for_simd_1(__first, __last - __first, __func);
}
// Else if the excution policy is parallel, we execute for_each on the CPU instead
- else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
- __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+ else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
+ __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
std::__terminate_on_exception([&] {
std::__par_backend::__parallel_for(
__first, __last, [__func](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
@@ -46,7 +46,7 @@ __pstl_for_each(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __
__cpu_backend_tag{}, __brick_first, __brick_last, __func);
});
});
- // Else we execute for_each in serial
+ // Else we execute for_each in serial
} else {
std::for_each(__first, __last, __func);
}
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
index 840118dbec5057c..4baa4e7f65859d1 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
@@ -46,8 +46,9 @@ _LIBCPP_HIDE_FROM_ABI inline T __omp_extract_base_ptr(std::__wrap_iter<T> w) {
// Applying function or lambda in a loop
template <class _Iterator, class _DifferenceType, class _Function>
-_LIBCPP_HIDE_FROM_ABI _Iterator __omp_parallel_for_simd_1(_Iterator __first, _DifferenceType __n, _Function __f) noexcept {
- #pragma omp target teams distribute parallel for simd map(tofrom:__first[0:__n])
+_LIBCPP_HIDE_FROM_ABI _Iterator
+__omp_parallel_for_simd_1(_Iterator __first, _DifferenceType __n, _Function __f) noexcept {
+# pragma omp target teams distribute parallel for simd map(tofrom : __first[0 : __n])
for (_DifferenceType __i = 0; __i < __n; ++__i)
__f(__first[__i]);
@@ -65,8 +66,9 @@ _LIBCPP_HIDE_FROM_ABI _Iterator __parallel_for_simd_1(_Iterator __first, _Differ
// Assigning a value in a loop
template <class _Index, class _DifferenceType, class _Tp>
-_LIBCPP_HIDE_FROM_ABI _Index __omp_parallel_for_simd_val_1(_Index __first, _DifferenceType __n, const _Tp& __value) noexcept {
- #pragma omp target teams distribute parallel for simd map(tofrom:__first[0:__n]) map(to:__value)
+_LIBCPP_HIDE_FROM_ABI _Index
+__omp_parallel_for_simd_val_1(_Index __first, _DifferenceType __n, const _Tp& __value) noexcept {
+# pragma omp target teams distribute parallel for simd map(tofrom : __first[0 : __n]) map(to : __value)
for (_DifferenceType __i = 0; __i < __n; ++__i)
__first[__i] = __value;
@@ -74,7 +76,8 @@ _LIBCPP_HIDE_FROM_ABI _Index __omp_parallel_for_simd_val_1(_Index __first, _Diff
}
template <class _Index, class _DifferenceType, class _Tp>
-_LIBCPP_HIDE_FROM_ABI _Index __parallel_for_simd_val_1(_Index __first, _DifferenceType __n, const _Tp& __value) noexcept {
+_LIBCPP_HIDE_FROM_ABI _Index
+__parallel_for_simd_val_1(_Index __first, _DifferenceType __n, const _Tp& __value) noexcept {
__omp_parallel_for_simd_val_1(__omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __value);
return __first + __n;
}
>From 57abf3062c4e559fddd6bf173d415212b9f92e43 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Thu, 21 Sep 2023 12:50:17 -0700
Subject: [PATCH 03/10] Making PSTL GPU backend depend on CMake options rather
than command line options
---
libcxx/CMakeLists.txt | 11 +++++------
libcxx/include/__algorithm/pstl_backend.h | 2 +-
.../include/__algorithm/pstl_backends/gpu_backend.h | 2 +-
.../__algorithm/pstl_backends/gpu_backends/backend.h | 8 ++++++--
libcxx/include/__config_site.in | 1 +
5 files changed, 14 insertions(+), 10 deletions(-)
diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt
index 43d2a448de79584..7aa47caa1ca335a 100644
--- a/libcxx/CMakeLists.txt
+++ b/libcxx/CMakeLists.txt
@@ -299,12 +299,10 @@ else()
set(LIBCXX_PSTL_CPU_BACKEND "serial" CACHE STRING "Which PSTL CPU backend to use")
endif()
-if (NOT DEFINED LIBCXX_PSTL_GPU_BACKEND)
- if (${LIBCXX_ENABLE_GPU_OFFLOAD})
- set(LIBCXX_PSTL_GPU_BACKEND "omp_offload" CACHE STRING "Which PSTL GPU backend to use")
- else()
- set(LIBCXX_PSTL_GPU_BACKEND "none" CACHE STRING "Which PSTL GPU backend to use")
- endif()
+if (${LIBCXX_ENABLE_GPU_OFFLOAD})
+ set(LIBCXX_PSTL_GPU_BACKEND "omp_offload" CACHE STRING "Which PSTL GPU backend to use")
+else()
+ set(LIBCXX_PSTL_GPU_BACKEND "none" CACHE STRING "Which PSTL GPU backend to use")
endif()
# Misc options ----------------------------------------------------------------
@@ -819,6 +817,7 @@ else()
Valid backends are: serial, std_thread and libdispatch")
endif()
+config_define_if(LIBCXX_ENABLE_GPU_OFFLOAD _LIBCPP_PSTL_GPU_OFFLOAD)
if (LIBCXX_PSTL_GPU_BACKEND STREQUAL "omp_offload")
config_define(1 _LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
endif()
diff --git a/libcxx/include/__algorithm/pstl_backend.h b/libcxx/include/__algorithm/pstl_backend.h
index 47f5191b48517ba..0df8847fd33589a 100644
--- a/libcxx/include/__algorithm/pstl_backend.h
+++ b/libcxx/include/__algorithm/pstl_backend.h
@@ -180,7 +180,7 @@ struct __select_backend<std::execution::parallel_policy> {
using type = __cpu_backend_tag;
};
-# if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
+# if defined(_LIBCPP_PSTL_GPU_OFFLOAD)
template <>
struct __select_backend<std::execution::parallel_unsequenced_policy> {
using type = __gpu_backend_tag;
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
index 7237036156a1bf3..d2a814b441224a5 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
@@ -13,7 +13,7 @@
#include <__algorithm/pstl_backends/gpu_backends/backend.h>
-#if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
+#if defined(_LIBCPP_PSTL_GPU_OFFLOAD)
# include <__algorithm/pstl_backends/gpu_backends/fill.h>
# include <__algorithm/pstl_backends/gpu_backends/for_each.h>
#endif
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/backend.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/backend.h
index a8b400afbb94d9d..a03ad35d8d2ae3e 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/backend.h
@@ -12,8 +12,12 @@
#include <__config>
#include <cstddef>
-#if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
-# include <__algorithm/pstl_backends/gpu_backends/omp_offload.h>
+#if defined(_LIBCPP_PSTL_GPU_OFFLOAD)
+# if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
+# include <__algorithm/pstl_backends/gpu_backends/omp_offload.h>
+# else
+# error Invalid PSTL GPU backend
+# endif
#endif
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
diff --git a/libcxx/include/__config_site.in b/libcxx/include/__config_site.in
index e0edddce3afc3ff..e7fb4f423079333 100644
--- a/libcxx/include/__config_site.in
+++ b/libcxx/include/__config_site.in
@@ -35,6 +35,7 @@
#cmakedefine _LIBCPP_PSTL_CPU_BACKEND_THREAD
#cmakedefine _LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH
#cmakedefine _LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD
+#cmakedefine _LIBCPP_PSTL_GPU_OFFLOAD
// Hardening.
#cmakedefine01 _LIBCPP_ENABLE_HARDENED_MODE_DEFAULT
>From 51d9ed5702a46ac604bafbe2f707033639f86706 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Thu, 21 Sep 2023 17:07:58 -0700
Subject: [PATCH 04/10] Added OpenMP offloaded version of std::transform
---
libcxx/include/CMakeLists.txt | 1 +
.../__algorithm/pstl_backends/gpu_backend.h | 1 +
.../pstl_backends/gpu_backends/omp_offload.h | 119 +++++++++++++++++-
.../pstl_backends/gpu_backends/transform.h | 117 +++++++++++++++++
4 files changed, 233 insertions(+), 5 deletions(-)
create mode 100644 libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index a3d72df61a86dde..66e54cfbf1780ee 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -90,6 +90,7 @@ set(files
__algorithm/pstl_backends/gpu_backends/fill.h
__algorithm/pstl_backends/gpu_backends/for_each.h
__algorithm/pstl_backends/gpu_backends/omp_offload.h
+ __algorithm/pstl_backends/gpu_backends/transform.h
__algorithm/pstl_copy.h
__algorithm/pstl_count.h
__algorithm/pstl_fill.h
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
index d2a814b441224a5..dac26592dac5c1f 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
@@ -16,6 +16,7 @@
#if defined(_LIBCPP_PSTL_GPU_OFFLOAD)
# include <__algorithm/pstl_backends/gpu_backends/fill.h>
# include <__algorithm/pstl_backends/gpu_backends/for_each.h>
+# include <__algorithm/pstl_backends/gpu_backends/transform.h>
#endif
#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
index 4baa4e7f65859d1..69221cbb8519233 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
@@ -28,6 +28,17 @@ _LIBCPP_BEGIN_NAMESPACE_STD
namespace __par_backend {
inline namespace __omp_gpu_backend {
+// Checking if a pointer is in a range
+template <typename T1, typename T2, typename T3>
+_LIBCPP_HIDE_FROM_ABI inline bool __omp_in_ptr_range(T1 a, T2 p, T3 b) {
+ return false;
+}
+
+template <typename T>
+_LIBCPP_HIDE_FROM_ABI inline bool __omp_in_ptr_range(T* a, T* p, T* b) {
+ return std::less_equal<T*>{}(a, p) && std::less<T*>{}(p, b);
+}
+
// In OpenMP, we need to extract the pointer for the underlying data for data
// structures like std::vector and std::array to be able to map the data to the
// device.
@@ -43,12 +54,16 @@ _LIBCPP_HIDE_FROM_ABI inline T __omp_extract_base_ptr(std::__wrap_iter<T> w) {
return PT.to_address(w);
}
+//===----------------------------------------------------------------------===//
+// Templates for one iterator
+//===----------------------------------------------------------------------===//
+
// Applying function or lambda in a loop
template <class _Iterator, class _DifferenceType, class _Function>
_LIBCPP_HIDE_FROM_ABI _Iterator
-__omp_parallel_for_simd_1(_Iterator __first, _DifferenceType __n, _Function __f) noexcept {
-# pragma omp target teams distribute parallel for simd map(tofrom : __first[0 : __n])
+__omp_parallel_for_simd_1(_Iterator __first, _DifferenceType __n, _Function __f, const int __device = 0) noexcept {
+# pragma omp target teams distribute parallel for simd map(tofrom : __first[0 : __n]) device(__device)
for (_DifferenceType __i = 0; __i < __n; ++__i)
__f(__first[__i]);
@@ -66,9 +81,10 @@ _LIBCPP_HIDE_FROM_ABI _Iterator __parallel_for_simd_1(_Iterator __first, _Differ
// Assigning a value in a loop
template <class _Index, class _DifferenceType, class _Tp>
-_LIBCPP_HIDE_FROM_ABI _Index
-__omp_parallel_for_simd_val_1(_Index __first, _DifferenceType __n, const _Tp& __value) noexcept {
-# pragma omp target teams distribute parallel for simd map(tofrom : __first[0 : __n]) map(to : __value)
+_LIBCPP_HIDE_FROM_ABI _Index __omp_parallel_for_simd_val_1(
+ _Index __first, _DifferenceType __n, const _Tp& __value, const int __device = 0) noexcept {
+# pragma omp target teams distribute parallel for simd map(from : __first[0 : __n]) map(always, to : __value) \
+ device(__device)
for (_DifferenceType __i = 0; __i < __n; ++__i)
__first[__i] = __value;
@@ -82,6 +98,99 @@ __parallel_for_simd_val_1(_Index __first, _DifferenceType __n, const _Tp& __valu
return __first + __n;
}
+//===----------------------------------------------------------------------===//
+// Templates for two iterators
+//===----------------------------------------------------------------------===//
+
+template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Function>
+_LIBCPP_HIDE_FROM_ABI _Iterator1 __omp_parallel_for_simd_2(
+ _Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Function __f, const int __device = 0) noexcept {
+ if ((!std::is_same<_Iterator1, _Iterator2>::value) ||
+ (std::is_same<_Iterator1, _Iterator2>::value &&
+ !__omp_gpu_backend::__omp_in_ptr_range(__first1, __first2, __first1 + __n))) {
+# pragma omp target teams distribute parallel for simd map(to : __first1[0 : __n]) map(from : __first2[0 : __n]) \
+ device(__device)
+ for (_DifferenceType __i = 0; __i < __n; ++__i)
+ __f(__first1[__i], __first2[__i]);
+ return __first1 + __n;
+ }
+# pragma omp target teams distribute parallel for simd map(tofrom : __first1[0 : __n], __first2[0 : __n]) \
+ device(__device)
+ for (_DifferenceType __i = 0; __i < __n; ++__i)
+ __f(__first1[__i], __first2[__i]);
+
+ return __first1 + __n;
+}
+
+// Extracting the underlying pointer
+
+template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Function>
+_LIBCPP_HIDE_FROM_ABI _Iterator1
+__parallel_for_simd_2(_Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Function __f) noexcept {
+ __omp_parallel_for_simd_2(
+ __omp_gpu_backend::__omp_extract_base_ptr(__first1),
+ __n,
+ __omp_gpu_backend::__omp_extract_base_ptr(__first2),
+ __f);
+ return __first1 + __n;
+}
+
+//===----------------------------------------------------------------------===//
+// Templates for three iterator
+//===----------------------------------------------------------------------===//
+
+template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Iterator3, class _Function>
+_LIBCPP_HIDE_FROM_ABI _Iterator1 __omp_parallel_for_simd_3(
+ _Iterator1 __first1,
+ _DifferenceType __n,
+ _Iterator2 __first2,
+ _Iterator3 __first3,
+ _Function __f,
+ const int __device = 0) noexcept {
+ // It may be that __first3 is in the interval [__first1+__n) or [__firt2+__n)
+ // It is, however, undefined behavior to compare two pointers that do not
+ // point to the same object or are not the same type.
+ // If we can prove that __first3 is not in any of the ranges [__first1+__n)
+ // or [__firt2+__n), it is safe to reduce the amount of data copied to and
+ // from the device
+ constexpr bool are_not_same_type =
+ !std::is_same<_Iterator1, _Iterator2>::value && !std::is_same<_Iterator1, _Iterator3>::value;
+ const bool no_overlap_13 =
+ std::is_same<_Iterator1, _Iterator3>::value &&
+ !__omp_gpu_backend::__omp_in_ptr_range(__first1, __first3, __first1 + __n);
+ const bool no_overlap_23 =
+ std::is_same<_Iterator2, _Iterator3>::value &&
+ !__omp_gpu_backend::__omp_in_ptr_range(__first2, __first3, __first2 + __n);
+ if (are_not_same_type || (no_overlap_13 && no_overlap_23)) {
+# pragma omp target teams distribute parallel for simd map(to : __first1[0 : __n], __first2[0 : __n]) \
+ map(from : __first3[0 : __n]) device(__device)
+ for (_DifferenceType __i = 0; __i < __n; ++__i)
+ __f(__first1[__i], __first2[__i], __first3[__i]);
+ return __first1 + __n;
+ }
+ // In the general case, we have to map all data to and from the device
+# pragma omp target teams distribute parallel for simd map( \
+ tofrom : __first1[0 : __n], __first2[0 : __n], __first3[0 : __n]) device(__device)
+ for (_DifferenceType __i = 0; __i < __n; ++__i)
+ __f(__first1[__i], __first2[__i], __first3[__i]);
+
+ return __first1 + __n;
+}
+
+// Extracting the underlying pointer
+
+template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Iterator3, class _Function>
+_LIBCPP_HIDE_FROM_ABI _Iterator1 __parallel_for_simd_3(
+ _Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Iterator3 __first3, _Function __f) noexcept {
+ __omp_parallel_for_simd_3(
+ __omp_gpu_backend::__omp_extract_base_ptr(__first1),
+ __n,
+ __omp_gpu_backend::__omp_extract_base_ptr(__first2),
+ __omp_gpu_backend::__omp_extract_base_ptr(__first3),
+ __f);
+ return __first1 + __n;
+}
+
} // namespace __omp_gpu_backend
} // namespace __par_backend
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
new file mode 100644
index 000000000000000..03eba11a3f5f52b
--- /dev/null
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
@@ -0,0 +1,117 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_TRANSFORM_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_TRANSFORM_H
+
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
+#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__algorithm/transform.h>
+#include <__config>
+#include <__iterator/concepts.h>
+#include <__iterator/iterator_traits.h>
+#include <__type_traits/enable_if.h>
+#include <__type_traits/is_execution_policy.h>
+#include <__type_traits/remove_cvref.h>
+#include <__utility/terminate_on_exception.h>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+# pragma GCC system_header
+#endif
+
+#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+template <class _ExecutionPolicy, class _ForwardIterator, class _ForwardOutIterator, class _UnaryOperation>
+_LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
+ __gpu_backend_tag,
+ _ForwardIterator __first,
+ _ForwardIterator __last,
+ _ForwardOutIterator __result,
+ _UnaryOperation __op) {
+ if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
+ __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
+ __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) {
+ return std::__par_backend::__parallel_for_simd_2(
+ __first,
+ __last - __first,
+ __result,
+ [&](__iter_reference<_ForwardIterator> __in_value, __iter_reference<_ForwardOutIterator> __out_value) {
+ __out_value = __op(__in_value);
+ });
+ } else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
+ __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
+ __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) {
+ std::__terminate_on_exception([&] {
+ std::__par_backend::__parallel_for(
+ __first, __last, [__op, __first, __result](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
+ return std::__pstl_transform<__remove_parallel_policy_t<_ExecutionPolicy>>(
+ __cpu_backend_tag{}, __brick_first, __brick_last, __result + (__brick_first - __first), __op);
+ });
+ });
+ return __result + (__last - __first);
+ } else {
+ return std::transform(__first, __last, __result, __op);
+ }
+}
+
+template <class _ExecutionPolicy,
+ class _ForwardIterator1,
+ class _ForwardIterator2,
+ class _ForwardOutIterator,
+ class _BinaryOperation,
+ enable_if_t<is_execution_policy_v<__remove_cvref_t<_ExecutionPolicy>>, int> = 0>
+_LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
+ __gpu_backend_tag,
+ _ForwardIterator1 __first1,
+ _ForwardIterator1 __last1,
+ _ForwardIterator2 __first2,
+ _ForwardOutIterator __result,
+ _BinaryOperation __op) {
+ if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
+ __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
+ __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value &&
+ __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) {
+ return std::__par_backend::__parallel_for_simd_3(
+ __first1,
+ __last1 - __first1,
+ __first2,
+ __result,
+ [&](__iter_reference<_ForwardIterator1> __in1,
+ __iter_reference<_ForwardIterator2> __in2,
+ __iter_reference<_ForwardOutIterator> __out_value) { __out_value = __op(__in1, __in2); });
+ } else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
+ __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
+ __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value &&
+ __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) {
+ std::__terminate_on_exception([&] {
+ std::__par_backend::__parallel_for(
+ __first1,
+ __last1,
+ [__op, __first1, __first2, __result](_ForwardIterator1 __brick_first, _ForwardIterator1 __brick_last) {
+ return std::__pstl_transform<__remove_parallel_policy_t<_ExecutionPolicy>>(
+ __cpu_backend_tag{},
+ __brick_first,
+ __brick_last,
+ __first2 + (__brick_first - __first1),
+ __result + (__brick_first - __first1),
+ __op);
+ });
+ });
+ return __result + (__last1 - __first1);
+ } else {
+ return std::transform(__first1, __last1, __first2, __result, __op);
+ }
+}
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_TRANSFORM_H
>From 33b61efe005cb12247cdf89ce7b7a4c5ca849f5b Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Fri, 22 Sep 2023 11:55:53 -0700
Subject: [PATCH 05/10] Changing lambdas to capture by value in std::transform
for GPUs
---
.../__algorithm/pstl_backends/gpu_backends/transform.h | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
index 03eba11a3f5f52b..7fcfde44aaaa7a6 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
@@ -38,11 +38,13 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
__has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
__has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) {
+ // While the CPU backend captures by reference, [&], that is not valid when
+ // offloading to the GPU. Therefore we must capture by value, [=].
return std::__par_backend::__parallel_for_simd_2(
__first,
__last - __first,
__result,
- [&](__iter_reference<_ForwardIterator> __in_value, __iter_reference<_ForwardOutIterator> __out_value) {
+ [=](__iter_reference<_ForwardIterator> __in_value, __iter_reference<_ForwardOutIterator> __out_value) {
__out_value = __op(__in_value);
});
} else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
@@ -78,12 +80,14 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
__has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
__has_random_access_iterator_category_or_concept<_ForwardIterator2>::value &&
__has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) {
+ // While the CPU backend captures by reference, [&], that is not valid when
+ // offloading to the GPU. Therefore we must capture by value, [=].
return std::__par_backend::__parallel_for_simd_3(
__first1,
__last1 - __first1,
__first2,
__result,
- [&](__iter_reference<_ForwardIterator1> __in1,
+ [=](__iter_reference<_ForwardIterator1> __in1,
__iter_reference<_ForwardIterator2> __in2,
__iter_reference<_ForwardOutIterator> __out_value) { __out_value = __op(__in1, __in2); });
} else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
>From 065f52d742a4e490b9b0441b9d997fe8e4268a8a Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Mon, 25 Sep 2023 13:13:39 -0700
Subject: [PATCH 06/10] GPU Offloading Implementation of std::transform_reduce
---
libcxx/include/CMakeLists.txt | 1 +
.../__algorithm/pstl_backends/gpu_backend.h | 1 +
.../pstl_backends/gpu_backends/omp_offload.h | 113 ++++++++++++++
.../gpu_backends/transform_reduce.h | 147 ++++++++++++++++++
4 files changed, 262 insertions(+)
create mode 100644 libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h
diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index 66e54cfbf1780ee..ea00d3fdaea2924 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -91,6 +91,7 @@ set(files
__algorithm/pstl_backends/gpu_backends/for_each.h
__algorithm/pstl_backends/gpu_backends/omp_offload.h
__algorithm/pstl_backends/gpu_backends/transform.h
+ __algorithm/pstl_backends/gpu_backends/transform_reduce.h
__algorithm/pstl_copy.h
__algorithm/pstl_count.h
__algorithm/pstl_fill.h
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
index dac26592dac5c1f..ea7f39dea905474 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
@@ -17,6 +17,7 @@
# include <__algorithm/pstl_backends/gpu_backends/fill.h>
# include <__algorithm/pstl_backends/gpu_backends/for_each.h>
# include <__algorithm/pstl_backends/gpu_backends/transform.h>
+# include <__algorithm/pstl_backends/gpu_backends/transform_reduce.h>
#endif
#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
index 69221cbb8519233..d1cc6133f8e0876 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
@@ -191,6 +191,119 @@ _LIBCPP_HIDE_FROM_ABI _Iterator1 __parallel_for_simd_3(
return __first1 + __n;
}
+//===----------------------------------------------------------------------===//
+// Templates for reductions
+//===----------------------------------------------------------------------===//
+
+// General case
+
+# define __PSTL_OMP_SIMD_1_REDUCTION(omp_op, std_op) \
+ template <class _Iterator, \
+ class _DifferenceType, \
+ typename _Tp, \
+ typename _BinaryOperationType, \
+ typename _UnaryOperation, \
+ __enable_if_t<is_arithmetic_v<_Tp>, int> = 0 > \
+ _LIBCPP_HIDE_FROM_ABI _Tp __omp_parallel_for_simd_reduction_1( \
+ _Iterator __first, \
+ _DifferenceType __n, \
+ _Tp __init, \
+ std_op<_BinaryOperationType> __reduce, \
+ _UnaryOperation __transform, \
+ const int __device = 0) noexcept { \
+_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init) map(to : __first[0 : __n]) device(__device)) \
+ for (_DifferenceType __i = 0; __i < __n; ++__i) \
+ __init = __reduce(__init, __transform(__first[__i])); \
+ return __init; \
+ }
+
+# define __PSTL_OMP_SIMD_2_REDUCTION(omp_op, std_op) \
+ template <class _Iterator1, \
+ class _Iterator2, \
+ class _DifferenceType, \
+ typename _Tp, \
+ typename _BinaryOperationType, \
+ typename _UnaryOperation, \
+ __enable_if_t<is_arithmetic_v<_Tp>, int> = 0 > \
+ _LIBCPP_HIDE_FROM_ABI _Tp __omp_parallel_for_simd_reduction_2( \
+ _Iterator1 __first1, \
+ _Iterator2 __first2, \
+ _DifferenceType __n, \
+ _Tp __init, \
+ std_op<_BinaryOperationType> __reduce, \
+ _UnaryOperation __transform, \
+ const int __device = 0) noexcept { \
+_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init) map(to : __first1[0 : __n], __first2[0 : __n]) device(__device)) \
+ for (_DifferenceType __i = 0; __i < __n; ++__i) \
+ __init = __reduce(__init, __transform(__first1[__i], __first2[__i])); \
+ return __init; \
+ }
+
+# define __PSTL_OMP_SIMD_REDUCTION(omp_op, std_op) \
+ __PSTL_OMP_SIMD_1_REDUCTION(omp_op, std_op) \
+ __PSTL_OMP_SIMD_2_REDUCTION(omp_op, std_op)
+
+// Addition
+__PSTL_OMP_SIMD_REDUCTION(+, std::plus)
+
+// Subtraction
+__PSTL_OMP_SIMD_REDUCTION(-, std::minus)
+
+// Multiplication
+__PSTL_OMP_SIMD_REDUCTION(*, std::multiplies)
+
+// Logical and
+__PSTL_OMP_SIMD_REDUCTION(&&, std::logical_and)
+
+// Logical or
+__PSTL_OMP_SIMD_REDUCTION(||, std::logical_or)
+
+// Bitwise and
+__PSTL_OMP_SIMD_REDUCTION(&, std::bit_and)
+
+// Bitwise or
+__PSTL_OMP_SIMD_REDUCTION(|, std::bit_or)
+
+// Bitwise xor
+__PSTL_OMP_SIMD_REDUCTION(^, std::bit_xor)
+
+// Extracting the underlying pointers
+
+template <class _Iterator, class _DifferenceType, typename _Tp, typename _BinaryOperation, typename _UnaryOperation >
+_LIBCPP_HIDE_FROM_ABI _Tp __parallel_for_simd_reduction_1(
+ _Iterator __first,
+ _DifferenceType __n,
+ _Tp __init,
+ _BinaryOperation __reduce,
+ _UnaryOperation __transform,
+ const int __device = 0) noexcept {
+ return __omp_parallel_for_simd_reduction_1(
+ __omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __init, __reduce, __transform);
+}
+
+template <class _Iterator1,
+ class _Iterator2,
+ class _DifferenceType,
+ typename _Tp,
+ typename _BinaryOperation,
+ typename _UnaryOperation >
+_LIBCPP_HIDE_FROM_ABI _Tp __parallel_for_simd_reduction_2(
+ _Iterator1 __first1,
+ _Iterator2 __first2,
+ _DifferenceType __n,
+ _Tp __init,
+ _BinaryOperation __reduce,
+ _UnaryOperation __transform,
+ const int __device = 0) noexcept {
+ return __omp_parallel_for_simd_reduction_2(
+ __omp_gpu_backend::__omp_extract_base_ptr(__first1),
+ __omp_gpu_backend::__omp_extract_base_ptr(__first2),
+ __n,
+ __init,
+ __reduce,
+ __transform);
+}
+
} // namespace __omp_gpu_backend
} // namespace __par_backend
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h
new file mode 100644
index 000000000000000..43e5631aef04afb
--- /dev/null
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h
@@ -0,0 +1,147 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_TRANSFORM_REDUCE_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_TRANSFORM_REDUCE_H
+
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
+#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__config>
+#include <__iterator/concepts.h>
+#include <__iterator/iterator_traits.h>
+#include <__numeric/transform_reduce.h>
+#include <__type_traits/is_arithmetic.h>
+#include <__type_traits/is_execution_policy.h>
+#include <__type_traits/operation_traits.h>
+#include <__utility/move.h>
+#include <__utility/terminate_on_exception.h>
+#include <new>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+# pragma GCC system_header
+#endif
+
+#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+//===----------------------------------------------------------------------===//
+// Two input iterators
+//===----------------------------------------------------------------------===//
+
+template <class _ExecutionPolicy,
+ class _ForwardIterator1,
+ class _ForwardIterator2,
+ class _Tp,
+ class _BinaryOperation1,
+ class _BinaryOperation2>
+_LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
+ __gpu_backend_tag,
+ _ForwardIterator1 __first1,
+ _ForwardIterator1 __last1,
+ _ForwardIterator2 __first2,
+ _Tp __init,
+ _BinaryOperation1 __reduce,
+ _BinaryOperation2 __transform) {
+ if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
+ __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
+ __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value) {
+ return std::__par_backend::__parallel_for_simd_reduction_2(
+ std::move(__first1),
+ std::move(__first2),
+ __last1 - __first1,
+ std::move(__init),
+ std::move(__reduce),
+ [=](__iter_reference<_ForwardIterator1> __in_value_1, __iter_reference<_ForwardIterator1> __in_value_2) {
+ return __transform(__in_value_1, __in_value_2);
+ });
+ } else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
+ __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
+ __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value) {
+ return std::__terminate_on_exception([&] {
+ return __par_backend::__parallel_transform_reduce(
+ __first1,
+ std::move(__last1),
+ [__first1, __first2, __transform](_ForwardIterator1 __iter) {
+ return __transform(*__iter, *(__first2 + (__iter - __first1)));
+ },
+ std::move(__init),
+ std::move(__reduce),
+ [__first1, __first2, __reduce, __transform](
+ _ForwardIterator1 __brick_first, _ForwardIterator1 __brick_last, _Tp __brick_init) {
+ return std::__pstl_transform_reduce<__remove_parallel_policy_t<_ExecutionPolicy>>(
+ __cpu_backend_tag{},
+ __brick_first,
+ std::move(__brick_last),
+ __first2 + (__brick_first - __first1),
+ std::move(__brick_init),
+ std::move(__reduce),
+ std::move(__transform));
+ });
+ });
+ } else {
+ return std::transform_reduce(
+ std::move(__first1),
+ std::move(__last1),
+ std::move(__first2),
+ std::move(__init),
+ std::move(__reduce),
+ std::move(__transform));
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// One input iterator
+//===----------------------------------------------------------------------===//
+
+template <class _ExecutionPolicy, class _ForwardIterator, class _Tp, class _BinaryOperation, class _UnaryOperation>
+_LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
+ __gpu_backend_tag,
+ _ForwardIterator __first,
+ _ForwardIterator __last,
+ _Tp __init,
+ _BinaryOperation __reduce,
+ _UnaryOperation __transform) {
+ if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
+ __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+ return std::__par_backend::__parallel_for_simd_reduction_1(
+ std::move(__first),
+ __last - __first,
+ std::move(__init),
+ std::move(__reduce),
+ [=](__iter_reference<_ForwardIterator> __in_value) { return __transform(__in_value); });
+ } else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
+ __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+ return std::__terminate_on_exception([&] {
+ return __par_backend::__parallel_transform_reduce(
+ std::move(__first),
+ std::move(__last),
+ [__transform](_ForwardIterator __iter) { return __transform(*__iter); },
+ std::move(__init),
+ __reduce,
+ [__transform, __reduce](auto __brick_first, auto __brick_last, _Tp __brick_init) {
+ return std::__pstl_transform_reduce<__remove_parallel_policy_t<_ExecutionPolicy>>(
+ __cpu_backend_tag{},
+ std::move(__brick_first),
+ std::move(__brick_last),
+ std::move(__brick_init),
+ std::move(__reduce),
+ std::move(__transform));
+ });
+ });
+ } else {
+ return std::transform_reduce(
+ std::move(__first), std::move(__last), std::move(__init), std::move(__reduce), std::move(__transform));
+ }
+}
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_TRANSFORM_REDUCE_H
>From 67ecdee8fc6fcf94873486858445cf1a7b37c7dc Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Tue, 26 Sep 2023 19:55:12 -0700
Subject: [PATCH 07/10] Fixed almost all test cases that failed during ninja
check-cxx
---
libcxx/include/CMakeLists.txt | 4 +
.../__algorithm/pstl_backends/gpu_backend.h | 4 +
.../pstl_backends/gpu_backends/any_of.h | 41 +++++++
.../pstl_backends/gpu_backends/fill.h | 20 +---
.../pstl_backends/gpu_backends/find_if.h | 44 ++++++++
.../pstl_backends/gpu_backends/for_each.h | 17 +--
.../pstl_backends/gpu_backends/merge.h | 51 +++++++++
.../pstl_backends/gpu_backends/omp_offload.h | 94 +++++++++-------
.../pstl_backends/gpu_backends/stable_sort.h | 38 +++++++
.../pstl_backends/gpu_backends/transform.h | 61 ++--------
.../gpu_backends/transform_reduce.h | 105 ++++++------------
11 files changed, 292 insertions(+), 187 deletions(-)
create mode 100644 libcxx/include/__algorithm/pstl_backends/gpu_backends/any_of.h
create mode 100644 libcxx/include/__algorithm/pstl_backends/gpu_backends/find_if.h
create mode 100644 libcxx/include/__algorithm/pstl_backends/gpu_backends/merge.h
create mode 100644 libcxx/include/__algorithm/pstl_backends/gpu_backends/stable_sort.h
diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index ea00d3fdaea2924..8dfd4e2f26ecf8f 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -86,10 +86,14 @@ set(files
__algorithm/pstl_backends/cpu_backends/transform.h
__algorithm/pstl_backends/cpu_backends/transform_reduce.h
__algorithm/pstl_backends/gpu_backend.h
+ __algorithm/pstl_backends/gpu_backends/any_of.h
__algorithm/pstl_backends/gpu_backends/backend.h
__algorithm/pstl_backends/gpu_backends/fill.h
+ __algorithm/pstl_backends/gpu_backends/find_if.h
__algorithm/pstl_backends/gpu_backends/for_each.h
+ __algorithm/pstl_backends/gpu_backends/merge.h
__algorithm/pstl_backends/gpu_backends/omp_offload.h
+ __algorithm/pstl_backends/gpu_backends/stable_sort.h
__algorithm/pstl_backends/gpu_backends/transform.h
__algorithm/pstl_backends/gpu_backends/transform_reduce.h
__algorithm/pstl_copy.h
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
index ea7f39dea905474..f41332fbf9f6d42 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
@@ -14,8 +14,12 @@
#include <__algorithm/pstl_backends/gpu_backends/backend.h>
#if defined(_LIBCPP_PSTL_GPU_OFFLOAD)
+# include <__algorithm/pstl_backends/gpu_backends/any_of.h>
# include <__algorithm/pstl_backends/gpu_backends/fill.h>
+# include <__algorithm/pstl_backends/gpu_backends/find_if.h>
# include <__algorithm/pstl_backends/gpu_backends/for_each.h>
+# include <__algorithm/pstl_backends/gpu_backends/merge.h>
+# include <__algorithm/pstl_backends/gpu_backends/stable_sort.h>
# include <__algorithm/pstl_backends/gpu_backends/transform.h>
# include <__algorithm/pstl_backends/gpu_backends/transform_reduce.h>
#endif
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/any_of.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/any_of.h
new file mode 100644
index 000000000000000..8d911de55dcd685
--- /dev/null
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/any_of.h
@@ -0,0 +1,41 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_ANY_OF_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_ANY_OF_H
+
+#include <__algorithm/any_of.h>
+#include <__algorithm/find_if.h>
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
+#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__atomic/atomic.h>
+#include <__atomic/memory_order.h>
+#include <__config>
+#include <__functional/operations.h>
+#include <__iterator/concepts.h>
+#include <__type_traits/is_execution_policy.h>
+#include <__utility/pair.h>
+#include <__utility/terminate_on_exception.h>
+#include <cstdint>
+
+#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+template <class _ExecutionPolicy, class _ForwardIterator, class _Predicate>
+_LIBCPP_HIDE_FROM_ABI bool
+__pstl_any_of(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) {
+ // TODO: Implement GPU backend
+ return std::__pstl_any_of<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __pred);
+}
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_ANY_OF_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
index 32926da87e2a083..8dc6bc6a6179c0e 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
@@ -14,6 +14,7 @@
#include <__algorithm/pstl_backends/gpu_backends/backend.h>
#include <__config>
#include <__iterator/concepts.h>
+#include <__iterator/iterator_traits.h>
#include <__type_traits/is_execution_policy.h>
#include <__utility/terminate_on_exception.h>
#include <stdio.h>
@@ -33,23 +34,12 @@ __pstl_fill(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last
// parallel unsequenced, as it is the only execution policy prohibiting throwing
// exceptions and allowing SIMD instructions
if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
- __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+ __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
+ __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
std::__par_backend::__parallel_for_simd_val_1(__first, __last - __first, __value);
}
- // Else if the excution policy is parallel, we execute for_each on the CPU instead
- else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
- __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
- std::__terminate_on_exception([&] {
- __par_backend::__parallel_for(
- __first, __last, [&__value](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
- std::__pstl_fill<__remove_parallel_policy_t<_ExecutionPolicy>>(
- __cpu_backend_tag{}, __brick_first, __brick_last, __value);
- });
- });
- // Else we execute for_each in serial
- } else {
- std::fill(__first, __last, __value);
- }
+ // Otherwise, we execute for_each on the CPU instead
+ return std::__pstl_fill<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __value);
}
_LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/find_if.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/find_if.h
new file mode 100644
index 000000000000000..2d34938f92dff38
--- /dev/null
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/find_if.h
@@ -0,0 +1,44 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_FIND_IF_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_FIND_IF_H
+
+#include <__algorithm/find_if.h>
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
+#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__atomic/atomic.h>
+#include <__config>
+#include <__functional/operations.h>
+#include <__iterator/concepts.h>
+#include <__iterator/iterator_traits.h>
+#include <__type_traits/is_execution_policy.h>
+#include <__utility/pair.h>
+#include <__utility/terminate_on_exception.h>
+#include <cstddef>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+# pragma GCC system_header
+#endif
+
+#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+template <class _ExecutionPolicy, class _ForwardIterator, class _Predicate>
+_LIBCPP_HIDE_FROM_ABI _ForwardIterator
+__pstl_find_if(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) {
+ // TODO: Implement the GPU backend
+ return std::__pstl_find_if<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __pred);
+}
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_FIND_IF_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
index 14de2af8e4a15c6..23c8da27e64ae3b 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
@@ -33,23 +33,12 @@ __pstl_for_each(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __
// parallel unsequenced, as it is the only execution policy prohibiting throwing
// exceptions and allowing SIMD instructions
if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
- __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+ __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
+ __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
std::__par_backend::__parallel_for_simd_1(__first, __last - __first, __func);
}
// Else if the excution policy is parallel, we execute for_each on the CPU instead
- else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
- __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
- std::__terminate_on_exception([&] {
- std::__par_backend::__parallel_for(
- __first, __last, [__func](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
- std::__pstl_for_each<__remove_parallel_policy_t<_ExecutionPolicy>>(
- __cpu_backend_tag{}, __brick_first, __brick_last, __func);
- });
- });
- // Else we execute for_each in serial
- } else {
- std::for_each(__first, __last, __func);
- }
+ return std::__pstl_for_each<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __func);
}
_LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/merge.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/merge.h
new file mode 100644
index 000000000000000..bc947ebb27ac7f0
--- /dev/null
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/merge.h
@@ -0,0 +1,51 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_MERGE_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_MERGE_H
+
+#include <__algorithm/merge.h>
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
+#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__config>
+#include <__iterator/concepts.h>
+#include <__type_traits/is_execution_policy.h>
+#include <__utility/move.h>
+#include <__utility/terminate_on_exception.h>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+# pragma GCC system_header
+#endif
+
+#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+template <class _ExecutionPolicy,
+ class _ForwardIterator1,
+ class _ForwardIterator2,
+ class _ForwardOutIterator,
+ class _Comp>
+_LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_merge(
+ __gpu_backend_tag,
+ _ForwardIterator1 __first1,
+ _ForwardIterator1 __last1,
+ _ForwardIterator2 __first2,
+ _ForwardIterator2 __last2,
+ _ForwardOutIterator __result,
+ _Comp __comp) {
+ // TODO: Implement GPU backend
+ return std::__pstl_merge<_ExecutionPolicy>(
+ __cpu_backend_tag{}, __first1, __last1, __first2, __last2, __result, __comp);
+}
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_MERGE_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
index d1cc6133f8e0876..36acafd448ec003 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
@@ -11,9 +11,19 @@
#include <__assert>
#include <__config>
+#include <__functional/operations.h>
+#include <__iterator/wrap_iter.h>
+#include <__memory/addressof.h>
+#include <__memory/pointer_traits.h>
+#include <__type_traits/is_pointer.h>
+#include <__type_traits/is_same.h>
#include <__utility/move.h>
#include <cstddef>
+// is_same
+
+// __libcpp_is_contiguous_iterator
+
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
# pragma GCC system_header
#endif
@@ -30,27 +40,33 @@ inline namespace __omp_gpu_backend {
// Checking if a pointer is in a range
template <typename T1, typename T2, typename T3>
-_LIBCPP_HIDE_FROM_ABI inline bool __omp_in_ptr_range(T1 a, T2 p, T3 b) {
+_LIBCPP_HIDE_FROM_ABI inline bool __omp_in_ptr_range(T1, T2, T3) {
return false;
}
-template <typename T>
-_LIBCPP_HIDE_FROM_ABI inline bool __omp_in_ptr_range(T* a, T* p, T* b) {
- return std::less_equal<T*>{}(a, p) && std::less<T*>{}(p, b);
+template <typename _Tp>
+_LIBCPP_HIDE_FROM_ABI inline bool __omp_in_ptr_range(_Tp* a, _Tp* p, _Tp* b) {
+ return std::less_equal<_Tp*>{}(a, p) && std::less<_Tp*>{}(p, b);
}
// In OpenMP, we need to extract the pointer for the underlying data for data
// structures like std::vector and std::array to be able to map the data to the
// device.
-template <typename T>
-_LIBCPP_HIDE_FROM_ABI inline T __omp_extract_base_ptr(T p) {
+template <typename _Tp, std::enable_if<std::is_pointer<_Tp>::value >::type* = 0>
+_LIBCPP_HIDE_FROM_ABI inline _Tp __omp_extract_base_ptr(_Tp p) {
return p;
}
-template <typename T>
-_LIBCPP_HIDE_FROM_ABI inline T __omp_extract_base_ptr(std::__wrap_iter<T> w) {
- std::pointer_traits<std::__wrap_iter<T>> PT;
+template <typename _Tp>
+_LIBCPP_HIDE_FROM_ABI inline auto __omp_extract_base_ptr(_Tp p) {
+ return std::addressof(*p);
+ ;
+}
+
+template <typename _Tp>
+_LIBCPP_HIDE_FROM_ABI inline _Tp __omp_extract_base_ptr(std::__wrap_iter<_Tp> w) {
+ std::pointer_traits<std::__wrap_iter<_Tp>> PT;
return PT.to_address(w);
}
@@ -61,8 +77,8 @@ _LIBCPP_HIDE_FROM_ABI inline T __omp_extract_base_ptr(std::__wrap_iter<T> w) {
// Applying function or lambda in a loop
template <class _Iterator, class _DifferenceType, class _Function>
-_LIBCPP_HIDE_FROM_ABI _Iterator
-__omp_parallel_for_simd_1(_Iterator __first, _DifferenceType __n, _Function __f, const int __device = 0) noexcept {
+_LIBCPP_HIDE_FROM_ABI _Iterator __omp_parallel_for_simd_1(
+ _Iterator __first, _DifferenceType __n, _Function __f, [[maybe_unused]] const int __device = 0) noexcept {
# pragma omp target teams distribute parallel for simd map(tofrom : __first[0 : __n]) device(__device)
for (_DifferenceType __i = 0; __i < __n; ++__i)
__f(__first[__i]);
@@ -82,7 +98,7 @@ _LIBCPP_HIDE_FROM_ABI _Iterator __parallel_for_simd_1(_Iterator __first, _Differ
template <class _Index, class _DifferenceType, class _Tp>
_LIBCPP_HIDE_FROM_ABI _Index __omp_parallel_for_simd_val_1(
- _Index __first, _DifferenceType __n, const _Tp& __value, const int __device = 0) noexcept {
+ _Index __first, _DifferenceType __n, const _Tp& __value, [[maybe_unused]] const int __device = 0) noexcept {
# pragma omp target teams distribute parallel for simd map(from : __first[0 : __n]) map(always, to : __value) \
device(__device)
for (_DifferenceType __i = 0; __i < __n; ++__i)
@@ -104,20 +120,24 @@ __parallel_for_simd_val_1(_Index __first, _DifferenceType __n, const _Tp& __valu
template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Function>
_LIBCPP_HIDE_FROM_ABI _Iterator1 __omp_parallel_for_simd_2(
- _Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Function __f, const int __device = 0) noexcept {
+ _Iterator1 __first1,
+ _DifferenceType __n,
+ _Iterator2 __first2,
+ _Function __f,
+ [[maybe_unused]] const int __device = 0) noexcept {
if ((!std::is_same<_Iterator1, _Iterator2>::value) ||
(std::is_same<_Iterator1, _Iterator2>::value &&
!__omp_gpu_backend::__omp_in_ptr_range(__first1, __first2, __first1 + __n))) {
# pragma omp target teams distribute parallel for simd map(to : __first1[0 : __n]) map(from : __first2[0 : __n]) \
device(__device)
for (_DifferenceType __i = 0; __i < __n; ++__i)
- __f(__first1[__i], __first2[__i]);
+ __first2[__i] = __f(__first1[__i]);
return __first1 + __n;
}
# pragma omp target teams distribute parallel for simd map(tofrom : __first1[0 : __n], __first2[0 : __n]) \
device(__device)
for (_DifferenceType __i = 0; __i < __n; ++__i)
- __f(__first1[__i], __first2[__i]);
+ __first2[__i] = __f(__first1[__i]);
return __first1 + __n;
}
@@ -146,7 +166,7 @@ _LIBCPP_HIDE_FROM_ABI _Iterator1 __omp_parallel_for_simd_3(
_Iterator2 __first2,
_Iterator3 __first3,
_Function __f,
- const int __device = 0) noexcept {
+ [[maybe_unused]] const int __device = 0) noexcept {
// It may be that __first3 is in the interval [__first1+__n) or [__firt2+__n)
// It is, however, undefined behavior to compare two pointers that do not
// point to the same object or are not the same type.
@@ -165,14 +185,14 @@ _LIBCPP_HIDE_FROM_ABI _Iterator1 __omp_parallel_for_simd_3(
# pragma omp target teams distribute parallel for simd map(to : __first1[0 : __n], __first2[0 : __n]) \
map(from : __first3[0 : __n]) device(__device)
for (_DifferenceType __i = 0; __i < __n; ++__i)
- __f(__first1[__i], __first2[__i], __first3[__i]);
+ __first3[__i] = __f(__first1[__i], __first2[__i]);
return __first1 + __n;
}
// In the general case, we have to map all data to and from the device
# pragma omp target teams distribute parallel for simd map( \
tofrom : __first1[0 : __n], __first2[0 : __n], __first3[0 : __n]) device(__device)
for (_DifferenceType __i = 0; __i < __n; ++__i)
- __f(__first1[__i], __first2[__i], __first3[__i]);
+ __first3[__i] = __f(__first1[__i], __first2[__i]);
return __first1 + __n;
}
@@ -197,46 +217,44 @@ _LIBCPP_HIDE_FROM_ABI _Iterator1 __parallel_for_simd_3(
// General case
-# define __PSTL_OMP_SIMD_1_REDUCTION(omp_op, std_op) \
+# define __PSTL_OMP_SIMD_1_REDUCTION(omp_op, std_op) \
template <class _Iterator, \
class _DifferenceType, \
typename _Tp, \
typename _BinaryOperationType, \
- typename _UnaryOperation, \
- __enable_if_t<is_arithmetic_v<_Tp>, int> = 0 > \
+ typename _UnaryOperation> \
_LIBCPP_HIDE_FROM_ABI _Tp __omp_parallel_for_simd_reduction_1( \
_Iterator __first, \
_DifferenceType __n, \
_Tp __init, \
std_op<_BinaryOperationType> __reduce, \
- _UnaryOperation __transform, \
- const int __device = 0) noexcept { \
-_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init) map(to : __first[0 : __n]) device(__device)) \
- for (_DifferenceType __i = 0; __i < __n; ++__i) \
- __init = __reduce(__init, __transform(__first[__i])); \
- return __init; \
+ _UnaryOperation __transform/*, \
+ [[maybe_unused]] const int __device = 0*/) noexcept { \
+_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init) map(to : __first[0 : __n])) /*device(__device))*/ \
+ for (_DifferenceType __i = 0; __i < __n; ++__i) \
+ __init = __reduce(__init, __transform(__first[__i])); \
+ return __init; \
}
-# define __PSTL_OMP_SIMD_2_REDUCTION(omp_op, std_op) \
+# define __PSTL_OMP_SIMD_2_REDUCTION(omp_op, std_op) \
template <class _Iterator1, \
class _Iterator2, \
class _DifferenceType, \
typename _Tp, \
typename _BinaryOperationType, \
- typename _UnaryOperation, \
- __enable_if_t<is_arithmetic_v<_Tp>, int> = 0 > \
+ typename _UnaryOperation > \
_LIBCPP_HIDE_FROM_ABI _Tp __omp_parallel_for_simd_reduction_2( \
_Iterator1 __first1, \
_Iterator2 __first2, \
_DifferenceType __n, \
_Tp __init, \
std_op<_BinaryOperationType> __reduce, \
- _UnaryOperation __transform, \
- const int __device = 0) noexcept { \
-_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init) map(to : __first1[0 : __n], __first2[0 : __n]) device(__device)) \
- for (_DifferenceType __i = 0; __i < __n; ++__i) \
- __init = __reduce(__init, __transform(__first1[__i], __first2[__i])); \
- return __init; \
+ _UnaryOperation __transform/*, \
+ [[maybe_unused]] const int __device = 0*/) noexcept { \
+_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init) map(to : __first1[0 : __n], __first2[0 : __n]))/* device(__device))*/ \
+ for (_DifferenceType __i = 0; __i < __n; ++__i) \
+ __init = __reduce(__init, __transform(__first1[__i], __first2[__i])); \
+ return __init; \
}
# define __PSTL_OMP_SIMD_REDUCTION(omp_op, std_op) \
@@ -276,7 +294,7 @@ _LIBCPP_HIDE_FROM_ABI _Tp __parallel_for_simd_reduction_1(
_Tp __init,
_BinaryOperation __reduce,
_UnaryOperation __transform,
- const int __device = 0) noexcept {
+ [[maybe_unused]] const int __device = 0) noexcept {
return __omp_parallel_for_simd_reduction_1(
__omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __init, __reduce, __transform);
}
@@ -294,7 +312,7 @@ _LIBCPP_HIDE_FROM_ABI _Tp __parallel_for_simd_reduction_2(
_Tp __init,
_BinaryOperation __reduce,
_UnaryOperation __transform,
- const int __device = 0) noexcept {
+ [[maybe_unused]] const int __device = 0) noexcept {
return __omp_parallel_for_simd_reduction_2(
__omp_gpu_backend::__omp_extract_base_ptr(__first1),
__omp_gpu_backend::__omp_extract_base_ptr(__first2),
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/stable_sort.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/stable_sort.h
new file mode 100644
index 000000000000000..1760a9fd9fc9d32
--- /dev/null
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/stable_sort.h
@@ -0,0 +1,38 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_STABLE_SORT_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_STABLE_SORT_H
+
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
+#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__algorithm/stable_sort.h>
+#include <__config>
+#include <__type_traits/is_execution_policy.h>
+#include <__utility/terminate_on_exception.h>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+# pragma GCC system_header
+#endif
+
+#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+template <class _ExecutionPolicy, class _RandomAccessIterator, class _Comp>
+_LIBCPP_HIDE_FROM_ABI void
+__pstl_stable_sort(__gpu_backend_tag, _RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp) {
+ // TODO: Implement GPU backend.
+ return __pstl_stable_sort<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __comp);
+}
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_STABLE_SORT_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
index 7fcfde44aaaa7a6..10f6e5ff174d675 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
@@ -37,30 +37,13 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
_UnaryOperation __op) {
if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
__has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
- __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) {
+ __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value &&
+ __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
// While the CPU backend captures by reference, [&], that is not valid when
// offloading to the GPU. Therefore we must capture by value, [=].
- return std::__par_backend::__parallel_for_simd_2(
- __first,
- __last - __first,
- __result,
- [=](__iter_reference<_ForwardIterator> __in_value, __iter_reference<_ForwardOutIterator> __out_value) {
- __out_value = __op(__in_value);
- });
- } else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
- __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
- __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) {
- std::__terminate_on_exception([&] {
- std::__par_backend::__parallel_for(
- __first, __last, [__op, __first, __result](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
- return std::__pstl_transform<__remove_parallel_policy_t<_ExecutionPolicy>>(
- __cpu_backend_tag{}, __brick_first, __brick_last, __result + (__brick_first - __first), __op);
- });
- });
- return __result + (__last - __first);
- } else {
- return std::transform(__first, __last, __result, __op);
+ return std::__par_backend::__parallel_for_simd_2(__first, __last - __first, __result, __op);
}
+ return std::__pstl_transform<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __result, __op);
}
template <class _ExecutionPolicy,
@@ -79,39 +62,15 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
__has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
__has_random_access_iterator_category_or_concept<_ForwardIterator2>::value &&
- __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) {
+ __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value &&
+ __libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
+ __libcpp_is_contiguous_iterator<_ForwardIterator2>::value &&
+ __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
// While the CPU backend captures by reference, [&], that is not valid when
// offloading to the GPU. Therefore we must capture by value, [=].
- return std::__par_backend::__parallel_for_simd_3(
- __first1,
- __last1 - __first1,
- __first2,
- __result,
- [=](__iter_reference<_ForwardIterator1> __in1,
- __iter_reference<_ForwardIterator2> __in2,
- __iter_reference<_ForwardOutIterator> __out_value) { __out_value = __op(__in1, __in2); });
- } else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
- __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
- __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value &&
- __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) {
- std::__terminate_on_exception([&] {
- std::__par_backend::__parallel_for(
- __first1,
- __last1,
- [__op, __first1, __first2, __result](_ForwardIterator1 __brick_first, _ForwardIterator1 __brick_last) {
- return std::__pstl_transform<__remove_parallel_policy_t<_ExecutionPolicy>>(
- __cpu_backend_tag{},
- __brick_first,
- __brick_last,
- __first2 + (__brick_first - __first1),
- __result + (__brick_first - __first1),
- __op);
- });
- });
- return __result + (__last1 - __first1);
- } else {
- return std::transform(__first1, __last1, __first2, __result, __op);
+ return std::__par_backend::__parallel_for_simd_3(__first1, __last1 - __first1, __first2, __result, __op);
}
+ return std::__pstl_transform<_ExecutionPolicy>(__cpu_backend_tag{}, __first1, __last1, __first2, __result, __op);
}
_LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h
index 43e5631aef04afb..8590dd3d024ea69 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h
@@ -12,9 +12,11 @@
#include <__algorithm/pstl_backends/cpu_backends/backend.h>
#include <__algorithm/pstl_backends/gpu_backends/backend.h>
#include <__config>
+#include <__functional/operations.h>
#include <__iterator/concepts.h>
#include <__iterator/iterator_traits.h>
#include <__numeric/transform_reduce.h>
+#include <__type_traits/integral_constant.h>
#include <__type_traits/is_arithmetic.h>
#include <__type_traits/is_execution_policy.h>
#include <__type_traits/operation_traits.h>
@@ -28,6 +30,25 @@
#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+template <class _T1, class _T2, class _T3>
+struct __is_supported_reduction : std::false_type {};
+
+# define __PSTL_IS_SUPPORTED_REDUCTION(funname) \
+ template <class _Tp> \
+ struct __is_supported_reduction<std::funname<_Tp>, _Tp, _Tp> : std::true_type {}; \
+ template <class _Tp, class _Up> \
+ struct __is_supported_reduction<std::funname<>, _Tp, _Up> : std::true_type {};
+
+// __is_trivial_plus_operation already exists
+__PSTL_IS_SUPPORTED_REDUCTION(plus)
+__PSTL_IS_SUPPORTED_REDUCTION(minus)
+__PSTL_IS_SUPPORTED_REDUCTION(multiplies)
+__PSTL_IS_SUPPORTED_REDUCTION(logical_and)
+__PSTL_IS_SUPPORTED_REDUCTION(logical_or)
+__PSTL_IS_SUPPORTED_REDUCTION(bit_and)
+__PSTL_IS_SUPPORTED_REDUCTION(bit_or)
+__PSTL_IS_SUPPORTED_REDUCTION(bit_xor)
+
_LIBCPP_BEGIN_NAMESPACE_STD
//===----------------------------------------------------------------------===//
@@ -50,49 +71,16 @@ _LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
_BinaryOperation2 __transform) {
if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
__has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
- __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value) {
+ __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value &&
+ __libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
+ __libcpp_is_contiguous_iterator<_ForwardIterator2>::value && is_arithmetic_v<_Tp> &&
+ (__is_trivial_plus_operation<_BinaryOperation1, _Tp, _Tp>::value ||
+ __is_supported_reduction<_BinaryOperation1, _Tp, _Tp>::value)) {
return std::__par_backend::__parallel_for_simd_reduction_2(
- std::move(__first1),
- std::move(__first2),
- __last1 - __first1,
- std::move(__init),
- std::move(__reduce),
- [=](__iter_reference<_ForwardIterator1> __in_value_1, __iter_reference<_ForwardIterator1> __in_value_2) {
- return __transform(__in_value_1, __in_value_2);
- });
- } else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
- __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
- __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value) {
- return std::__terminate_on_exception([&] {
- return __par_backend::__parallel_transform_reduce(
- __first1,
- std::move(__last1),
- [__first1, __first2, __transform](_ForwardIterator1 __iter) {
- return __transform(*__iter, *(__first2 + (__iter - __first1)));
- },
- std::move(__init),
- std::move(__reduce),
- [__first1, __first2, __reduce, __transform](
- _ForwardIterator1 __brick_first, _ForwardIterator1 __brick_last, _Tp __brick_init) {
- return std::__pstl_transform_reduce<__remove_parallel_policy_t<_ExecutionPolicy>>(
- __cpu_backend_tag{},
- __brick_first,
- std::move(__brick_last),
- __first2 + (__brick_first - __first1),
- std::move(__brick_init),
- std::move(__reduce),
- std::move(__transform));
- });
- });
- } else {
- return std::transform_reduce(
- std::move(__first1),
- std::move(__last1),
- std::move(__first2),
- std::move(__init),
- std::move(__reduce),
- std::move(__transform));
+ __first1, __first2, __last1 - __first1, __init, __reduce, __transform);
}
+ return std::__pstl_transform_reduce<_ExecutionPolicy>(
+ __cpu_backend_tag{}, __first1, __last1, __first2, std::move(__init), __reduce, __transform);
}
//===----------------------------------------------------------------------===//
@@ -108,36 +96,15 @@ _LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
_BinaryOperation __reduce,
_UnaryOperation __transform) {
if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
- __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+ __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
+ __libcpp_is_contiguous_iterator<_ForwardIterator>::value && is_arithmetic_v<_Tp> &&
+ (__is_trivial_plus_operation<_BinaryOperation, _Tp, _Tp>::value ||
+ __is_supported_reduction<_BinaryOperation, _Tp, _Tp>::value)) {
return std::__par_backend::__parallel_for_simd_reduction_1(
- std::move(__first),
- __last - __first,
- std::move(__init),
- std::move(__reduce),
- [=](__iter_reference<_ForwardIterator> __in_value) { return __transform(__in_value); });
- } else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
- __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
- return std::__terminate_on_exception([&] {
- return __par_backend::__parallel_transform_reduce(
- std::move(__first),
- std::move(__last),
- [__transform](_ForwardIterator __iter) { return __transform(*__iter); },
- std::move(__init),
- __reduce,
- [__transform, __reduce](auto __brick_first, auto __brick_last, _Tp __brick_init) {
- return std::__pstl_transform_reduce<__remove_parallel_policy_t<_ExecutionPolicy>>(
- __cpu_backend_tag{},
- std::move(__brick_first),
- std::move(__brick_last),
- std::move(__brick_init),
- std::move(__reduce),
- std::move(__transform));
- });
- });
- } else {
- return std::transform_reduce(
- std::move(__first), std::move(__last), std::move(__init), std::move(__reduce), std::move(__transform));
+ __first, __last - __first, __init, __reduce, __transform);
}
+ return std::__pstl_transform_reduce<_ExecutionPolicy>(
+ __cpu_backend_tag{}, __first, __last, std::move(__init), __reduce, __transform);
}
_LIBCPP_END_NAMESPACE_STD
>From 94474817194a47d06a11567dd0e9aeabc7b6da63 Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Tue, 26 Sep 2023 20:15:33 -0700
Subject: [PATCH 08/10] Missing return statements in fill and for_each
---
libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h | 2 +-
.../include/__algorithm/pstl_backends/gpu_backends/for_each.h | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
index 8dc6bc6a6179c0e..d109495009df895 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
@@ -36,7 +36,7 @@ __pstl_fill(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last
if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
__has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
__libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
- std::__par_backend::__parallel_for_simd_val_1(__first, __last - __first, __value);
+ return std::__par_backend::__parallel_for_simd_val_1(__first, __last - __first, __value);
}
// Otherwise, we execute for_each on the CPU instead
return std::__pstl_fill<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __value);
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
index 23c8da27e64ae3b..bab0c87de8f2fc7 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
@@ -35,7 +35,7 @@ __pstl_for_each(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __
if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
__has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
__libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
- std::__par_backend::__parallel_for_simd_1(__first, __last - __first, __func);
+ return std::__par_backend::__parallel_for_simd_1(__first, __last - __first, __func);
}
// Else if the excution policy is parallel, we execute for_each on the CPU instead
return std::__pstl_for_each<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __func);
>From 96adadf8f7227f6543537056f27f98cb18bbe8ce Mon Sep 17 00:00:00 2001
From: AntonRydahl <rydahl2610 at gmail.com>
Date: Wed, 27 Sep 2023 10:31:16 -0700
Subject: [PATCH 09/10] Passing all LIT tests
---
.../__algorithm/pstl_backends/gpu_backends/fill.h | 10 ++++++----
.../pstl_backends/gpu_backends/for_each.h | 8 +++++---
.../pstl_backends/gpu_backends/stable_sort.h | 2 +-
.../pstl_backends/gpu_backends/transform.h | 15 ++++++++-------
.../pstl_backends/gpu_backends/transform_reduce.h | 6 +++---
5 files changed, 23 insertions(+), 18 deletions(-)
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
index d109495009df895..f32ee8b016b3eae 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
@@ -30,16 +30,18 @@ _LIBCPP_BEGIN_NAMESPACE_STD
template <class _ExecutionPolicy, class _ForwardIterator, class _Tp>
_LIBCPP_HIDE_FROM_ABI void
__pstl_fill(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) {
- // It is only safe to execute for_each on the GPU, it the execution policy is
+ // It is only safe to execute fill on the GPU, it the execution policy is
// parallel unsequenced, as it is the only execution policy prohibiting throwing
// exceptions and allowing SIMD instructions
if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
__has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
__libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
- return std::__par_backend::__parallel_for_simd_val_1(__first, __last - __first, __value);
+ std::__par_backend::__parallel_for_simd_val_1(__first, __last - __first, __value);
+ }
+ // Otherwise, we execute fill on the CPU instead
+ else {
+ std::__pstl_fill<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __value);
}
- // Otherwise, we execute for_each on the CPU instead
- return std::__pstl_fill<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __value);
}
_LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
index bab0c87de8f2fc7..f96b30b5ba25b24 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
@@ -35,10 +35,12 @@ __pstl_for_each(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __
if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
__has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
__libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
- return std::__par_backend::__parallel_for_simd_1(__first, __last - __first, __func);
+ std::__par_backend::__parallel_for_simd_1(__first, __last - __first, __func);
+ }
+ // Else we fall back to the GPU backend
+ else {
+ std::__pstl_for_each<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __func);
}
- // Else if the excution policy is parallel, we execute for_each on the CPU instead
- return std::__pstl_for_each<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __func);
}
_LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/stable_sort.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/stable_sort.h
index 1760a9fd9fc9d32..5cd7081ef73e9c1 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/stable_sort.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/stable_sort.h
@@ -28,7 +28,7 @@ template <class _ExecutionPolicy, class _RandomAccessIterator, class _Comp>
_LIBCPP_HIDE_FROM_ABI void
__pstl_stable_sort(__gpu_backend_tag, _RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp) {
// TODO: Implement GPU backend.
- return __pstl_stable_sort<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __comp);
+ __pstl_stable_sort<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __comp);
}
_LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
index 10f6e5ff174d675..c2e43cb6d643375 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
@@ -38,11 +38,12 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
__has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
__has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value &&
- __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
- // While the CPU backend captures by reference, [&], that is not valid when
- // offloading to the GPU. Therefore we must capture by value, [=].
- return std::__par_backend::__parallel_for_simd_2(__first, __last - __first, __result, __op);
+ __libcpp_is_contiguous_iterator<_ForwardIterator>::value &&
+ __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
+ std::__par_backend::__parallel_for_simd_2(__first, __last - __first, __result, __op);
+ return __result + (__last - __first);
}
+ // If it is not safe to offload to the GPU, we rely on the CPU backend.
return std::__pstl_transform<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __result, __op);
}
@@ -66,10 +67,10 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
__libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
__libcpp_is_contiguous_iterator<_ForwardIterator2>::value &&
__libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
- // While the CPU backend captures by reference, [&], that is not valid when
- // offloading to the GPU. Therefore we must capture by value, [=].
- return std::__par_backend::__parallel_for_simd_3(__first1, __last1 - __first1, __first2, __result, __op);
+ std::__par_backend::__parallel_for_simd_3(__first1, __last1 - __first1, __first2, __result, __op);
+ return __result + (__last1 - __first1);
}
+ // If it is not safe to offload to the GPU, we rely on the CPU backend.
return std::__pstl_transform<_ExecutionPolicy>(__cpu_backend_tag{}, __first1, __last1, __first2, __result, __op);
}
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h
index 8590dd3d024ea69..332bb8abc1b8e0b 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h
@@ -31,13 +31,13 @@
#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
template <class _T1, class _T2, class _T3>
-struct __is_supported_reduction : std::false_type {};
+_LIBCPP_HIDE_FROM_ABI struct __is_supported_reduction : std::false_type {};
# define __PSTL_IS_SUPPORTED_REDUCTION(funname) \
template <class _Tp> \
- struct __is_supported_reduction<std::funname<_Tp>, _Tp, _Tp> : std::true_type {}; \
+ _LIBCPP_HIDE_FROM_ABI struct __is_supported_reduction<std::funname<_Tp>, _Tp, _Tp> : std::true_type {}; \
template <class _Tp, class _Up> \
- struct __is_supported_reduction<std::funname<>, _Tp, _Up> : std::true_type {};
+ _LIBCPP_HIDE_FROM_ABI struct __is_supported_reduction<std::funname<>, _Tp, _Up> : std::true_type {};
// __is_trivial_plus_operation already exists
__PSTL_IS_SUPPORTED_REDUCTION(plus)
>From 1c5b0b26f7573d9024dbee72baec1db37a05a415 Mon Sep 17 00:00:00 2001
From: Tom Lin <tom91136 at gmail.com>
Date: Mon, 2 Oct 2023 01:22:07 +0100
Subject: [PATCH 10/10] Patches to enable BabelStream, miniBUDE, CloverLeaf,
and TeaLeaf Abort on CPU path for now
---
clang/lib/Headers/__clang_hip_math.h | 2 +-
libcxx/include/CMakeLists.txt | 1 +
.../__algorithm/pstl_backends/gpu_backend.h | 1 +
.../pstl_backends/gpu_backends/copy.h | 57 +++++++
.../pstl_backends/gpu_backends/fill.h | 1 +
.../pstl_backends/gpu_backends/for_each.h | 6 +-
.../pstl_backends/gpu_backends/omp_offload.h | 147 ++++++++++++++++--
.../pstl_backends/gpu_backends/transform.h | 9 +-
.../gpu_backends/transform_reduce.h | 21 ++-
9 files changed, 214 insertions(+), 31 deletions(-)
create mode 100644 libcxx/include/__algorithm/pstl_backends/gpu_backends/copy.h
diff --git a/clang/lib/Headers/__clang_hip_math.h b/clang/lib/Headers/__clang_hip_math.h
index 58aa55d74769031..3644cf603c78241 100644
--- a/clang/lib/Headers/__clang_hip_math.h
+++ b/clang/lib/Headers/__clang_hip_math.h
@@ -15,7 +15,7 @@
#if !defined(__HIPCC_RTC__)
#if defined(__cplusplus)
-#include <algorithm>
+// #include <algorithm>
#endif
#include <limits.h>
#include <stdint.h>
diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index 8dfd4e2f26ecf8f..145485064f47aa3 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -89,6 +89,7 @@ set(files
__algorithm/pstl_backends/gpu_backends/any_of.h
__algorithm/pstl_backends/gpu_backends/backend.h
__algorithm/pstl_backends/gpu_backends/fill.h
+ __algorithm/pstl_backends/gpu_backends/copy.h
__algorithm/pstl_backends/gpu_backends/find_if.h
__algorithm/pstl_backends/gpu_backends/for_each.h
__algorithm/pstl_backends/gpu_backends/merge.h
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
index f41332fbf9f6d42..6e35d59e0718679 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
@@ -16,6 +16,7 @@
#if defined(_LIBCPP_PSTL_GPU_OFFLOAD)
# include <__algorithm/pstl_backends/gpu_backends/any_of.h>
# include <__algorithm/pstl_backends/gpu_backends/fill.h>
+# include <__algorithm/pstl_backends/gpu_backends/copy.h>
# include <__algorithm/pstl_backends/gpu_backends/find_if.h>
# include <__algorithm/pstl_backends/gpu_backends/for_each.h>
# include <__algorithm/pstl_backends/gpu_backends/merge.h>
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/copy.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/copy.h
new file mode 100644
index 000000000000000..6d080c17ce200af
--- /dev/null
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/copy.h
@@ -0,0 +1,57 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_COPY_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_COPY_H
+
+#include <__algorithm/copy.h>
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
+#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__config>
+#include <__iterator/concepts.h>
+#include <__iterator/iterator_traits.h>
+#include <__type_traits/is_execution_policy.h>
+#include <__utility/terminate_on_exception.h>
+#include <stdio.h>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+# pragma GCC system_header
+#endif
+
+#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+template <class _ExecutionPolicy, class _ForwardIterator, class _ForwardOutIterator>
+_LIBCPP_HIDE_FROM_ABI _ForwardOutIterator
+__pstl_copy(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _ForwardOutIterator __result) {
+ if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
+ __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
+ __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value
+ // __libcpp_is_contiguous_iterator<_ForwardIterator>::value &&
+ // __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value
+ ) {
+ // XXX There's an even faster path that calls omp_target_memcpy, the slow path with __identity() will map your data
+ // which is catastrophic for USM inputs in terms of performance
+ std::__par_backend::__parallel_for_simd_2(__first, __last - __first, __result);
+ return __result + (__last - __first);
+ }
+ std::abort();
+ // If it is not safe to offload to the GPU, we rely on the CPU backend.
+ return std::__pstl_transform<_ExecutionPolicy>(
+ __cpu_backend_tag{},
+ __first,
+ __last,
+ __result, //
+ __identity());
+}
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_COPY_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
index f32ee8b016b3eae..364de22271daff6 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
@@ -40,6 +40,7 @@ __pstl_fill(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last
}
// Otherwise, we execute fill on the CPU instead
else {
+ std::abort();
std::__pstl_fill<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __value);
}
}
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
index f96b30b5ba25b24..db70f0948e40f7f 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
@@ -33,12 +33,14 @@ __pstl_for_each(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __
// parallel unsequenced, as it is the only execution policy prohibiting throwing
// exceptions and allowing SIMD instructions
if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
- __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
- __libcpp_is_contiguous_iterator<_ForwardIterator>::value) {
+ __has_random_access_iterator_category_or_concept<_ForwardIterator>::value
+// __libcpp_is_contiguous_iterator<_ForwardIterator>::value
+ ) {
std::__par_backend::__parallel_for_simd_1(__first, __last - __first, __func);
}
// Else we fall back to the GPU backend
else {
+ std::abort();
std::__pstl_for_each<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __func);
}
}
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
index 36acafd448ec003..3470ab95d98522c 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
@@ -90,7 +90,13 @@ _LIBCPP_HIDE_FROM_ABI _Iterator __omp_parallel_for_simd_1(
template <class _Iterator, class _DifferenceType, class _Function>
_LIBCPP_HIDE_FROM_ABI _Iterator __parallel_for_simd_1(_Iterator __first, _DifferenceType __n, _Function __f) noexcept {
- __omp_parallel_for_simd_1(__omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __f);
+ if constexpr (std::is_reference_v<decltype(__first[0])>) {
+ __omp_parallel_for_simd_1(__omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __f);
+ } else {
+# pragma omp target teams distribute parallel for simd /* device(__device) */
+ for (_DifferenceType __i = 0; __i < __n; ++__i)
+ __f(*(__first + __i));
+ }
return __first + __n;
}
@@ -99,11 +105,14 @@ _LIBCPP_HIDE_FROM_ABI _Iterator __parallel_for_simd_1(_Iterator __first, _Differ
template <class _Index, class _DifferenceType, class _Tp>
_LIBCPP_HIDE_FROM_ABI _Index __omp_parallel_for_simd_val_1(
_Index __first, _DifferenceType __n, const _Tp& __value, [[maybe_unused]] const int __device = 0) noexcept {
+ if constexpr (std::is_reference_v<decltype(__first[0])>) {
# pragma omp target teams distribute parallel for simd map(from : __first[0 : __n]) map(always, to : __value) \
device(__device)
- for (_DifferenceType __i = 0; __i < __n; ++__i)
- __first[__i] = __value;
-
+ for (_DifferenceType __i = 0; __i < __n; ++__i)
+ __first[__i] = __value;
+ } else {
+ static_assert(false, "Cannot offload iterators not backed by a pointer");
+ }
return __first + __n;
}
@@ -147,11 +156,55 @@ _LIBCPP_HIDE_FROM_ABI _Iterator1 __omp_parallel_for_simd_2(
template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Function>
_LIBCPP_HIDE_FROM_ABI _Iterator1
__parallel_for_simd_2(_Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Function __f) noexcept {
- __omp_parallel_for_simd_2(
- __omp_gpu_backend::__omp_extract_base_ptr(__first1),
- __n,
- __omp_gpu_backend::__omp_extract_base_ptr(__first2),
- __f);
+ // The second iterator must be backed by a pointer as we're writing to it
+ static_assert(std::is_reference_v<decltype(__first2[0])>, "Destination iterator is read only");
+ if constexpr (std::is_reference_v<decltype(__first1[0])>) {
+ __omp_parallel_for_simd_2(
+ __omp_gpu_backend::__omp_extract_base_ptr(__first1),
+ __n,
+ __omp_gpu_backend::__omp_extract_base_ptr(__first2),
+ __f);
+ } else {
+ // XXX first1 iterator is counting, first1 and first2 will be different here
+# pragma omp target teams distribute parallel for simd /* device(__device) */
+ for (_DifferenceType __i = 0; __i < __n; ++__i)
+ *(__first2 + __i) = __f(*(__first1 + __i));
+ }
+ return __first1 + __n;
+}
+
+extern "C" int omp_target_memcpy(
+ void* dst,
+ const void* src,
+ size_t length,
+ size_t dst_offset,
+ size_t src_offset,
+ int dst_device_num,
+ int src_device_num);
+
+template <class _Iterator1, class _DifferenceType, class _Iterator2>
+_LIBCPP_HIDE_FROM_ABI _Iterator1
+__parallel_for_simd_2(_Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2) noexcept {
+ // The second iterator must be backed by a pointer as we're writing to it
+ static_assert(std::is_reference_v<decltype(__first2[0])>, "Destination iterator is read only");
+ if constexpr (std::is_reference_v<decltype(__first1[0])>) {
+ // Host and dest are both pointers,
+ int r = omp_target_memcpy(
+ __omp_gpu_backend::__omp_extract_base_ptr(__first2),
+ __omp_gpu_backend::__omp_extract_base_ptr(__first1),
+ __n * sizeof(typename std::iterator_traits<_Iterator1>::value_type),
+ 0,
+ 0,
+ 0,
+ 0);
+ if (r != 0)
+ std::abort();
+ } else {
+ // XXX first1 iterator is counting, first1 and first2 will be different here
+# pragma omp target teams distribute parallel for simd /* device(__device) */
+ for (_DifferenceType __i = 0; __i < __n; ++__i)
+ *(__first2 + __i) = *(__first1 + __i);
+ }
return __first1 + __n;
}
@@ -236,7 +289,7 @@ _PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__in
return __init; \
}
-# define __PSTL_OMP_SIMD_2_REDUCTION(omp_op, std_op) \
+# define __PSTL_OMP_SIMD_2_REDUCTION(omp_op, std_op) \
template <class _Iterator1, \
class _Iterator2, \
class _DifferenceType, \
@@ -250,11 +303,11 @@ _PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__in
_Tp __init, \
std_op<_BinaryOperationType> __reduce, \
_UnaryOperation __transform/*, \
- [[maybe_unused]] const int __device = 0*/) noexcept { \
-_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init) map(to : __first1[0 : __n], __first2[0 : __n]))/* device(__device))*/ \
- for (_DifferenceType __i = 0; __i < __n; ++__i) \
- __init = __reduce(__init, __transform(__first1[__i], __first2[__i])); \
- return __init; \
+ [[maybe_unused]] const int __device = 0*/) noexcept { \
+_PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(omp_op:__init) )/* device(__device))*/ \
+ for (_DifferenceType __i = 0; __i < __n; ++__i) \
+ __init = __reduce(__init, __transform(__first1[__i], __first2[__i])); \
+ return __init; \
}
# define __PSTL_OMP_SIMD_REDUCTION(omp_op, std_op) \
@@ -287,6 +340,57 @@ __PSTL_OMP_SIMD_REDUCTION(^, std::bit_xor)
// Extracting the underlying pointers
+template <class F, class... Ts>
+auto __lift_fn(Ts&&... xs) {
+# if __cplusplus >= 202002L
+ return F{}(std::forward<Ts>(xs)...);
+# else
+ static_assert(std::is_empty<F>(), "Can't lift lambda with captures");
+ return (*reinterpret_cast<const F*>(0))(std::forward<Ts>(xs)...);
+# endif
+}
+
+template <class _Iterator, class _DifferenceType, typename _Tp, typename _BinaryOperation, typename _UnaryOperation>
+_Tp __transform_reduce0(
+ _Iterator __first,
+ _DifferenceType __n,
+ _Tp __init,
+ _BinaryOperation __reduce,
+ _UnaryOperation __transform,
+ [[maybe_unused]] const int __device = 0) noexcept {
+ struct alignas(16)
+ __reducer { // TODO switch __lift_fn for the 0-length base: struct __reducer : _BinaryOperation {...}
+ _Tp value;
+ bool init = false;
+ __reducer operator+(__reducer that) const {
+ return (init && that.init) ? __reducer{__lift_fn<_BinaryOperation>(value, that.value), true}
+ : (init ? *this : that);
+ }
+
+ // __reducer& operator+=(__reducer &that) {
+ // value = (init && that.init) ? __lift_fn<_BinaryOperation>(value, that.value): (init ? value : that.value);
+ // init = (init && that.init) ? true: (init ? init : that.init);
+ // return *this;
+ // }
+ };
+
+ auto __result0 = __reducer{__init};
+# pragma omp target teams distribute parallel for simd reduction(+ : __result0) \
+ map(to : __first, __transform, __reduce) /*device(__device))*/
+ for (decltype(__n) __i = 0; __i < __n; __i += 2) {
+ __result0.init = true;
+ __result0.value =
+ __i + 1 >= __n //
+ ? __transform(*(__first + __i))
+ : __lift_fn<_BinaryOperation>(__transform(*(__first + __i)), __transform(*(__first + (__i + 1))));
+ }
+ // for (decltype(__n) __i = 0; __i < __n; ++__i) {
+ // __result0 = __result0 + __reducer{__transform(*(__first + __i)), true};
+ // }
+ //
+ return __result0.value;
+}
+
template <class _Iterator, class _DifferenceType, typename _Tp, typename _BinaryOperation, typename _UnaryOperation >
_LIBCPP_HIDE_FROM_ABI _Tp __parallel_for_simd_reduction_1(
_Iterator __first,
@@ -295,8 +399,17 @@ _LIBCPP_HIDE_FROM_ABI _Tp __parallel_for_simd_reduction_1(
_BinaryOperation __reduce,
_UnaryOperation __transform,
[[maybe_unused]] const int __device = 0) noexcept {
- return __omp_parallel_for_simd_reduction_1(
- __omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __init, __reduce, __transform);
+ if constexpr (std::is_reference_v<decltype(__first[0])>) {
+ return __omp_parallel_for_simd_reduction_1(
+ __omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __init, __reduce, __transform);
+ } else {
+ // _PSTL_PRAGMA(omp target teams distribute parallel for simd reduction(+:__init) map(to : __first ))
+ // /*device(__device))*/ for (_DifferenceType __i = 0; __i < __n; ++__i)
+ // __init = __reduce(__init, __transform(__first[__i]));
+ // return __init;
+
+ return __transform_reduce0(__first, __n, __init, __reduce, __transform);
+ }
}
template <class _Iterator1,
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
index c2e43cb6d643375..e26947c93feace5 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h
@@ -37,12 +37,14 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
_UnaryOperation __op) {
if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
__has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
- __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value &&
- __libcpp_is_contiguous_iterator<_ForwardIterator>::value &&
- __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value) {
+ __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value
+// __libcpp_is_contiguous_iterator<_ForwardIterator>::value &&
+// __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value
+ ) {
std::__par_backend::__parallel_for_simd_2(__first, __last - __first, __result, __op);
return __result + (__last - __first);
}
+ std::abort();
// If it is not safe to offload to the GPU, we rely on the CPU backend.
return std::__pstl_transform<_ExecutionPolicy>(__cpu_backend_tag{}, __first, __last, __result, __op);
}
@@ -71,6 +73,7 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
return __result + (__last1 - __first1);
}
// If it is not safe to offload to the GPU, we rely on the CPU backend.
+ std::abort();
return std::__pstl_transform<_ExecutionPolicy>(__cpu_backend_tag{}, __first1, __last1, __first2, __result, __op);
}
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h
index 332bb8abc1b8e0b..50af5f2a35cd805 100644
--- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform_reduce.h
@@ -22,7 +22,6 @@
#include <__type_traits/operation_traits.h>
#include <__utility/move.h>
#include <__utility/terminate_on_exception.h>
-#include <new>
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
# pragma GCC system_header
@@ -72,13 +71,16 @@ _LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
__has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
__has_random_access_iterator_category_or_concept<_ForwardIterator2>::value &&
- __libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
- __libcpp_is_contiguous_iterator<_ForwardIterator2>::value && is_arithmetic_v<_Tp> &&
+// __libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
+// __libcpp_is_contiguous_iterator<_ForwardIterator2>::value &&
+// is_arithmetic_v<_Tp> &&
(__is_trivial_plus_operation<_BinaryOperation1, _Tp, _Tp>::value ||
- __is_supported_reduction<_BinaryOperation1, _Tp, _Tp>::value)) {
+ __is_supported_reduction<_BinaryOperation1, _Tp, _Tp>::value)
+ ) {
return std::__par_backend::__parallel_for_simd_reduction_2(
__first1, __first2, __last1 - __first1, __init, __reduce, __transform);
}
+ std::abort();
return std::__pstl_transform_reduce<_ExecutionPolicy>(
__cpu_backend_tag{}, __first1, __last1, __first2, std::move(__init), __reduce, __transform);
}
@@ -96,13 +98,16 @@ _LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
_BinaryOperation __reduce,
_UnaryOperation __transform) {
if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
- __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
- __libcpp_is_contiguous_iterator<_ForwardIterator>::value && is_arithmetic_v<_Tp> &&
- (__is_trivial_plus_operation<_BinaryOperation, _Tp, _Tp>::value ||
- __is_supported_reduction<_BinaryOperation, _Tp, _Tp>::value)) {
+ __has_random_access_iterator_category_or_concept<_ForwardIterator>::value
+// __libcpp_is_contiguous_iterator<_ForwardIterator>::value &&
+// is_arithmetic_v<_Tp> && true
+// (__is_trivial_plus_operation<_BinaryOperation, _Tp, _Tp>::value ||
+// __is_supported_reduction<_BinaryOperation, _Tp, _Tp>::value)
+ ) {
return std::__par_backend::__parallel_for_simd_reduction_1(
__first, __last - __first, __init, __reduce, __transform);
}
+ std::abort();
return std::__pstl_transform_reduce<_ExecutionPolicy>(
__cpu_backend_tag{}, __first, __last, std::move(__init), __reduce, __transform);
}
More information about the cfe-commits
mailing list