[libcxx-commits] [libcxx] [RFC] Offloading C++ standard parallel algorithms to GPUs using OpenMP (PR #66465)
Anton Rydahl via libcxx-commits
libcxx-commits at lists.llvm.org
Wed Sep 20 10:29:59 PDT 2023
https://github.com/AntonRydahl updated https://github.com/llvm/llvm-project/pull/66465
>From f5f6b3ff15a2e26656221950e8e3fbe85766a0ef Mon Sep 17 00:00:00 2001
From: antonrydahl <rydahl2610 at gmail.com>
Date: Thu, 14 Sep 2023 23:43:43 -0700
Subject: [PATCH 1/4] Proof of concept for offloading C++ standard parallel
algorithms to the GPU using OpenMP. To enable offloading, compile with
-D_LIBCPP_ENABLE_OPENMP_OFFLOAD
---
.../pstl_backends/cpu_backends/fill.h | 2 +-
.../pstl_backends/cpu_backends/for_each.h | 21 ++-
.../pstl_backends/cpu_backends/transform.h | 28 +++-
.../cpu_backends/transform_reduce.h | 8 +-
libcxx/include/__config | 121 +++++++++++++++++-
5 files changed, 169 insertions(+), 11 deletions(-)
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h
index 8b531887c731808..7f6687306970364 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h
@@ -27,7 +27,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
template <class _Index, class _DifferenceType, class _Tp>
_LIBCPP_HIDE_FROM_ABI _Index __simd_fill_n(_Index __first, _DifferenceType __n, const _Tp& __value) noexcept {
_PSTL_USE_NONTEMPORAL_STORES_IF_ALLOWED
- _PSTL_PRAGMA_SIMD
+ _PSTL_PRAGMA_SIMD()
for (_DifferenceType __i = 0; __i < __n; ++__i)
__first[__i] = __value;
return __first + __n;
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h
index f6f22fdd8713ced..1155c070496c87c 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h
@@ -26,10 +26,29 @@ _LIBCPP_BEGIN_NAMESPACE_STD
template <class _Iterator, class _DifferenceType, class _Function>
_LIBCPP_HIDE_FROM_ABI _Iterator __simd_walk_1(_Iterator __first, _DifferenceType __n, _Function __f) noexcept {
- _PSTL_PRAGMA_SIMD
+ _PSTL_OMP_MAP_TO(__first,__n);
+ _PSTL_PRAGMA_SIMD(__n)
for (_DifferenceType __i = 0; __i < __n; ++__i)
__f(__first[__i]);
+ _PSTL_OMP_MAP_FROM(__first,__n);
+ return __first + __n;
+}
+/**
+ * Specialization for std::vector where the base pointer must be extrated to map
+ * the data to and from the GPU.
+*/
+
+template <typename T, class _DifferenceType, class _Function>
+_LIBCPP_HIDE_FROM_ABI std::__wrap_iter<T*> __simd_walk_1(std::__wrap_iter<T*> __first, _DifferenceType __n, _Function __f) noexcept {
+ _PSTL_OMP_MAP_TO(__first,__n);
+ // For std::vector the base pointer of the data buffer needs to be extracted
+ std::pointer_traits<std::__wrap_iter<T*>> PT;
+ T* data = PT.to_address(__first);
+ _PSTL_PRAGMA_SIMD(__n)
+ for (_DifferenceType __i = 0; __i < __n; ++__i)
+ __f(data[__i]);
+ _PSTL_OMP_MAP_FROM(__first,__n);
return __first + __n;
}
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h
index 0259d8a84bb3f76..6611be3b7a44833 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h
@@ -30,9 +30,33 @@ _LIBCPP_BEGIN_NAMESPACE_STD
template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Function>
_LIBCPP_HIDE_FROM_ABI _Iterator2
__simd_walk_2(_Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Function __f) noexcept {
- _PSTL_PRAGMA_SIMD
+ _PSTL_OMP_MAP_TO(__first1,__n);
+ _PSTL_OMP_MAP_ALLOC(__first2,__n);
+ _PSTL_PRAGMA_SIMD(__n)
for (_DifferenceType __i = 0; __i < __n; ++__i)
__f(__first1[__i], __first2[__i]);
+ _PSTL_OMP_MAP_FROM(__first2,__n);
+ return __first2 + __n;
+}
+
+/**
+ * Specialization for std::vector where the base pointer must be extrated to map
+ * the data to and from the GPU.
+*/
+
+template <typename T1, class _DifferenceType, typename T2, class _Function>
+_LIBCPP_HIDE_FROM_ABI std::__wrap_iter<T1*>
+__simd_walk_2(std::__wrap_iter<T1*> __first1, _DifferenceType __n, std::__wrap_iter<T2*> __first2, _Function __f) noexcept {
+ _PSTL_OMP_MAP_TO(__first1,__n);
+ _PSTL_OMP_MAP_ALLOC(__first2,__n);
+ std::pointer_traits<std::__wrap_iter<T1*>> PT1;
+ std::pointer_traits<std::__wrap_iter<T2*>> PT2;
+ T1* __data1 = PT1.to_address(__first1);
+ T2* __data2 = PT2.to_address(__first2);
+ _PSTL_PRAGMA_SIMD(__n)
+ for (_DifferenceType __i = 0; __i < __n; ++__i)
+ __f(__data1[__i], __data2[__i]);
+ _PSTL_OMP_MAP_FROM(__first2,__n);
return __first2 + __n;
}
@@ -72,7 +96,7 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Iterator3, class _Function>
_LIBCPP_HIDE_FROM_ABI _Iterator3 __simd_walk_3(
_Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Iterator3 __first3, _Function __f) noexcept {
- _PSTL_PRAGMA_SIMD
+ _PSTL_PRAGMA_SIMD()
for (_DifferenceType __i = 0; __i < __n; ++__i)
__f(__first1[__i], __first2[__i], __first3[__i]);
return __first3 + __n;
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h
index 2afe5c7d10483fe..885648982409cb9 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h
@@ -57,7 +57,7 @@ __simd_transform_reduce(_Size __n, _Tp __init, _BinaryOperation __binary_op, _Un
_Tp* __lane = reinterpret_cast<_Tp*>(__lane_buffer);
// initializer
- _PSTL_PRAGMA_SIMD
+ _PSTL_PRAGMA_SIMD()
for (_Size __i = 0; __i < __block_size; ++__i) {
::new (__lane + __i) _Tp(__binary_op(__f(__i), __f(__block_size + __i)));
}
@@ -65,13 +65,13 @@ __simd_transform_reduce(_Size __n, _Tp __init, _BinaryOperation __binary_op, _Un
_Size __i = 2 * __block_size;
const _Size __last_iteration = __block_size * (__n / __block_size);
for (; __i < __last_iteration; __i += __block_size) {
- _PSTL_PRAGMA_SIMD
+ _PSTL_PRAGMA_SIMD()
for (_Size __j = 0; __j < __block_size; ++__j) {
__lane[__j] = __binary_op(std::move(__lane[__j]), __f(__i + __j));
}
}
// remainder
- _PSTL_PRAGMA_SIMD
+ _PSTL_PRAGMA_SIMD()
for (_Size __j = 0; __j < __n - __last_iteration; ++__j) {
__lane[__j] = __binary_op(std::move(__lane[__j]), __f(__last_iteration + __j));
}
@@ -80,7 +80,7 @@ __simd_transform_reduce(_Size __n, _Tp __init, _BinaryOperation __binary_op, _Un
__init = __binary_op(std::move(__init), std::move(__lane[__j]));
}
// destroyer
- _PSTL_PRAGMA_SIMD
+ _PSTL_PRAGMA_SIMD()
for (_Size __j = 0; __j < __block_size; ++__j) {
__lane[__j].~_Tp();
}
diff --git a/libcxx/include/__config b/libcxx/include/__config
index bf2564e2732ba98..087c5c91235511d 100644
--- a/libcxx/include/__config
+++ b/libcxx/include/__config
@@ -1414,8 +1414,118 @@ __sanitizer_verify_double_ended_contiguous_container(const void*, const void*, c
// Enable SIMD for compilers that support OpenMP 4.0
# if (defined(_OPENMP) && _OPENMP >= 201307)
+# ifdef _LIBCPP_ENABLE_OPENMP_OFFLOAD
+# ifndef _LIBCPP_ENABLE_OPENMP_OFFLOAD_MIN_BYTES
+# define _LIBCPP_ENABLE_OPENMP_OFFLOAD_MIN_BYTES 32768
+# endif
+#include <omp.h>
+#include <__iterator/wrap_iter.h>
+# define _PSTL_PRAGMA_DATA_MAP_TO(NAME,LEN) _PSTL_PRAGMA(omp target enter data map(to:NAME[:LEN]))
+# define _PSTL_PRAGMA_DATA_MAP_ALLOC(NAME,LEN) _PSTL_PRAGMA(omp target enter data map(alloc:NAME[:LEN]))
+# define _PSTL_PRAGMA_DATA_MAP_FROM(NAME,LEN) _PSTL_PRAGMA(omp target update from(NAME[:LEN]))
+
+template <typename N>
+bool constexpr OMPIsOffloadable(N size)
+{
+ return size >= _LIBCPP_ENABLE_OPENMP_OFFLOAD_MIN_BYTES;
+}
+
+bool constexpr OMPIsOffloadable(void)
+{
+ return false;
+}
+
+template <typename T, typename N>
+void inline OMPMapToIf(T data,N length,int device = omp_get_default_device())
+{
+ // If the data is already present on the device, there is no need
+ // transfer the data again.
+ if (omp_target_is_present(data,device)){
+ return;
+ }
+ // If it is a small amount of data it does not make sense to offload to a
+ // device
+ if (!OMPIsOffloadable(length)){
+ return;
+ }
+ _PSTL_PRAGMA_DATA_MAP_TO(data,length);
+}
+
+template <typename T, typename N>
+void inline OMPMapAllocIf(T data,N length,int device = omp_get_default_device())
+{
+ // If the data is already present on the device, there is no need
+ // transfer the data again.
+ if (omp_target_is_present(data,device)){
+ return;
+ }
+ // If it is a small amount of data it does not make sense to offload to a
+ // device
+ if (!OMPIsOffloadable(length)){
+ return;
+ }
+ _PSTL_PRAGMA_DATA_MAP_ALLOC(data,length);
+}
+
+template <typename T, typename N>
+void inline OMPMapTo(T data,N length) {
+ OMPMapToIf(data,length);
+}
+
+/**
+ * Specialization for std::vector
+*/
+
+template <typename T,typename N>
+void inline OMPMapTo(std::__wrap_iter<T*> w,N length) {
+ std::pointer_traits<std::__wrap_iter<T*>> PT;
+ T* data = PT.to_address(w);
+ OMPMapToIf(data,length);
+}
+
+template <typename T, typename N>
+void inline OMPMapAlloc(T data,N length) {
+ OMPMapAllocIf(data,length);
+}
+
+/**
+ * Specialization for std::vector
+*/
+
+template <typename T,typename N>
+void inline OMPMapAlloc(std::__wrap_iter<T*> w,N length) {
+ std::pointer_traits<std::__wrap_iter<T*>> PT;
+ T* data = PT.to_address(w);
+ OMPMapAllocIf(data,length);
+}
+
+template <typename T,typename N>
+void inline OMPMapFrom(T data,N length) {
+ _PSTL_PRAGMA_DATA_MAP_FROM(data,length);
+}
+
+/**
+ * Specialization for std::vector
+*/
+
+template <typename T,typename N>
+void inline OMPMapFrom(std::__wrap_iter<T*> w,N length) {
+ std::pointer_traits<std::__wrap_iter<T*>> PT;
+ T* data = PT.to_address(w);
+ _PSTL_PRAGMA_DATA_MAP_FROM(data,length);
+}
+# define _PSTL_OMP_MAP_TO(DATA,LEN) OMPMapTo(DATA,LEN)
+# define _PSTL_OMP_MAP_ALLOC(DATA,LEN) OMPMapAlloc(DATA,LEN)
+# define _PSTL_OMP_MAP_FROM(DATA,LEN) OMPMapFrom(DATA,LEN)
+# define _PSTL_PRAGMA_SIMD(...) _PSTL_PRAGMA(omp target teams distribute parallel for simd if(OMPIsOffloadable(__VA_ARGS__)))
+# else
+# define _PSTL_PRAGMA_SIMD(...) _PSTL_PRAGMA(omp simd)
+# define _PSTL_OMP_MAP_TO(DATA,LEN)
+# define _PSTL_OMP_MAP_ALLOC(DATA,LEN)
+# define _PSTL_OMP_MAP_FROM(DATA,LEN)
+# endif
+
# define _PSTL_UDR_PRESENT
-# define _PSTL_PRAGMA_SIMD _PSTL_PRAGMA(omp simd)
# define _PSTL_PRAGMA_DECLARE_SIMD _PSTL_PRAGMA(omp declare simd)
# define _PSTL_PRAGMA_SIMD_REDUCTION(PRM) _PSTL_PRAGMA(omp simd reduction(PRM))
# define _PSTL_PRAGMA_SIMD_SCAN(PRM) _PSTL_PRAGMA(omp simd reduction(inscan, PRM))
@@ -1434,7 +1544,7 @@ __sanitizer_verify_double_ended_contiguous_container(const void*, const void*, c
# elif defined(_LIBCPP_COMPILER_CLANG_BASED)
-# define _PSTL_PRAGMA_SIMD _Pragma("clang loop vectorize(enable) interleave(enable)")
+# define _PSTL_PRAGMA_SIMD(...) _Pragma("clang loop vectorize(enable) interleave(enable)")
# define _PSTL_PRAGMA_DECLARE_SIMD
# define _PSTL_PRAGMA_SIMD_REDUCTION(PRM) _Pragma("clang loop vectorize(enable) interleave(enable)")
# define _PSTL_PRAGMA_SIMD_SCAN(PRM) _Pragma("clang loop vectorize(enable) interleave(enable)")
@@ -1444,7 +1554,7 @@ __sanitizer_verify_double_ended_contiguous_container(const void*, const void*, c
# else // (defined(_OPENMP) && _OPENMP >= 201307)
-# define _PSTL_PRAGMA_SIMD
+# define _PSTL_PRAGMA_SIMD(...)
# define _PSTL_PRAGMA_DECLARE_SIMD
# define _PSTL_PRAGMA_SIMD_REDUCTION(PRM)
# define _PSTL_PRAGMA_SIMD_SCAN(PRM)
@@ -1454,6 +1564,11 @@ __sanitizer_verify_double_ended_contiguous_container(const void*, const void*, c
# endif // (defined(_OPENMP) && _OPENMP >= 201307)
+# ifndef _LIBCPP_ENABLE_OPENMP_OFFLOAD
+# define _PSTL_OMP_MAP_TO(...)
+# define _PSTL_OMP_MAP_FROM(...)
+# endif
+
# define _PSTL_USE_NONTEMPORAL_STORES_IF_ALLOWED
#endif // __cplusplus
>From 897ee3a2029ee77ec56ac62d4f4e7a409e9308f8 Mon Sep 17 00:00:00 2001
From: antonrydahl <rydahl2610 at gmail.com>
Date: Fri, 15 Sep 2023 10:29:04 -0700
Subject: [PATCH 2/4] Clang-formatted for_Each.h and transform.h
---
.../pstl_backends/cpu_backends/for_each.h | 15 ++++++++-------
.../pstl_backends/cpu_backends/transform.h | 18 +++++++++---------
2 files changed, 17 insertions(+), 16 deletions(-)
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h
index 1155c070496c87c..ed336766295cb07 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h
@@ -26,29 +26,30 @@ _LIBCPP_BEGIN_NAMESPACE_STD
template <class _Iterator, class _DifferenceType, class _Function>
_LIBCPP_HIDE_FROM_ABI _Iterator __simd_walk_1(_Iterator __first, _DifferenceType __n, _Function __f) noexcept {
- _PSTL_OMP_MAP_TO(__first,__n);
+ _PSTL_OMP_MAP_TO(__first, __n);
_PSTL_PRAGMA_SIMD(__n)
for (_DifferenceType __i = 0; __i < __n; ++__i)
__f(__first[__i]);
- _PSTL_OMP_MAP_FROM(__first,__n);
+ _PSTL_OMP_MAP_FROM(__first, __n);
return __first + __n;
}
/**
* Specialization for std::vector where the base pointer must be extrated to map
* the data to and from the GPU.
-*/
+ */
template <typename T, class _DifferenceType, class _Function>
-_LIBCPP_HIDE_FROM_ABI std::__wrap_iter<T*> __simd_walk_1(std::__wrap_iter<T*> __first, _DifferenceType __n, _Function __f) noexcept {
- _PSTL_OMP_MAP_TO(__first,__n);
- // For std::vector the base pointer of the data buffer needs to be extracted
+_LIBCPP_HIDE_FROM_ABI std::__wrap_iter<T*>
+__simd_walk_1(std::__wrap_iter<T*> __first, _DifferenceType __n, _Function __f) noexcept {
+ _PSTL_OMP_MAP_TO(__first, __n);
+ // For std::vector the base pointer of the data buffer needs to be extracted
std::pointer_traits<std::__wrap_iter<T*>> PT;
T* data = PT.to_address(__first);
_PSTL_PRAGMA_SIMD(__n)
for (_DifferenceType __i = 0; __i < __n; ++__i)
__f(data[__i]);
- _PSTL_OMP_MAP_FROM(__first,__n);
+ _PSTL_OMP_MAP_FROM(__first, __n);
return __first + __n;
}
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h
index 6611be3b7a44833..06348b0fc5c2a5b 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h
@@ -30,25 +30,25 @@ _LIBCPP_BEGIN_NAMESPACE_STD
template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Function>
_LIBCPP_HIDE_FROM_ABI _Iterator2
__simd_walk_2(_Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Function __f) noexcept {
- _PSTL_OMP_MAP_TO(__first1,__n);
- _PSTL_OMP_MAP_ALLOC(__first2,__n);
+ _PSTL_OMP_MAP_TO(__first1, __n);
+ _PSTL_OMP_MAP_ALLOC(__first2, __n);
_PSTL_PRAGMA_SIMD(__n)
for (_DifferenceType __i = 0; __i < __n; ++__i)
__f(__first1[__i], __first2[__i]);
- _PSTL_OMP_MAP_FROM(__first2,__n);
+ _PSTL_OMP_MAP_FROM(__first2, __n);
return __first2 + __n;
}
/**
* Specialization for std::vector where the base pointer must be extrated to map
* the data to and from the GPU.
-*/
+ */
template <typename T1, class _DifferenceType, typename T2, class _Function>
-_LIBCPP_HIDE_FROM_ABI std::__wrap_iter<T1*>
-__simd_walk_2(std::__wrap_iter<T1*> __first1, _DifferenceType __n, std::__wrap_iter<T2*> __first2, _Function __f) noexcept {
- _PSTL_OMP_MAP_TO(__first1,__n);
- _PSTL_OMP_MAP_ALLOC(__first2,__n);
+_LIBCPP_HIDE_FROM_ABI std::__wrap_iter<T1*> __simd_walk_2(
+ std::__wrap_iter<T1*> __first1, _DifferenceType __n, std::__wrap_iter<T2*> __first2, _Function __f) noexcept {
+ _PSTL_OMP_MAP_TO(__first1, __n);
+ _PSTL_OMP_MAP_ALLOC(__first2, __n);
std::pointer_traits<std::__wrap_iter<T1*>> PT1;
std::pointer_traits<std::__wrap_iter<T2*>> PT2;
T1* __data1 = PT1.to_address(__first1);
@@ -56,7 +56,7 @@ __simd_walk_2(std::__wrap_iter<T1*> __first1, _DifferenceType __n, std::__wrap_i
_PSTL_PRAGMA_SIMD(__n)
for (_DifferenceType __i = 0; __i < __n; ++__i)
__f(__data1[__i], __data2[__i]);
- _PSTL_OMP_MAP_FROM(__first2,__n);
+ _PSTL_OMP_MAP_FROM(__first2, __n);
return __first2 + __n;
}
>From 22e51f1ca279908be7c595036546a06a3ef123a2 Mon Sep 17 00:00:00 2001
From: antonrydahl <rydahl2610 at gmail.com>
Date: Mon, 18 Sep 2023 13:11:33 -0700
Subject: [PATCH 3/4] Updated OpenMP data mapping to be conservative unless
_LIBCPP_OPENMP_OFFLOAD_MAPPED is defined
---
libcxx/include/__config | 28 +++++++++++++++++++++++++---
1 file changed, 25 insertions(+), 3 deletions(-)
diff --git a/libcxx/include/__config b/libcxx/include/__config
index 087c5c91235511d..f4e5c511cdf6771 100644
--- a/libcxx/include/__config
+++ b/libcxx/include/__config
@@ -1422,7 +1422,7 @@ __sanitizer_verify_double_ended_contiguous_container(const void*, const void*, c
#include <__iterator/wrap_iter.h>
# define _PSTL_PRAGMA_DATA_MAP_TO(NAME,LEN) _PSTL_PRAGMA(omp target enter data map(to:NAME[:LEN]))
# define _PSTL_PRAGMA_DATA_MAP_ALLOC(NAME,LEN) _PSTL_PRAGMA(omp target enter data map(alloc:NAME[:LEN]))
-# define _PSTL_PRAGMA_DATA_MAP_FROM(NAME,LEN) _PSTL_PRAGMA(omp target update from(NAME[:LEN]))
+# define _PSTL_PRAGMA_DATA_MAP_FROM(NAME,LEN) _PSTL_PRAGMA(omp target exit data map(from:NAME[:LEN]))
template <typename N>
bool constexpr OMPIsOffloadable(N size)
@@ -1440,9 +1440,11 @@ void inline OMPMapToIf(T data,N length,int device = omp_get_default_device())
{
// If the data is already present on the device, there is no need
// transfer the data again.
+#ifdef _LIBCPP_OPENMP_OFFLOAD_MAPPED
if (omp_target_is_present(data,device)){
return;
}
+#endif
// If it is a small amount of data it does not make sense to offload to a
// device
if (!OMPIsOffloadable(length)){
@@ -1456,9 +1458,11 @@ void inline OMPMapAllocIf(T data,N length,int device = omp_get_default_device())
{
// If the data is already present on the device, there is no need
// transfer the data again.
+#ifdef _LIBCPP_OPENMP_OFFLOAD_MAPPED
if (omp_target_is_present(data,device)){
return;
}
+#endif
// If it is a small amount of data it does not make sense to offload to a
// device
if (!OMPIsOffloadable(length)){
@@ -1467,6 +1471,24 @@ void inline OMPMapAllocIf(T data,N length,int device = omp_get_default_device())
_PSTL_PRAGMA_DATA_MAP_ALLOC(data,length);
}
+template <typename T, typename N>
+void inline OMPMapFromIf(T data,N length,int device = omp_get_default_device())
+{
+ // If the data is already present on the device, there is no need to
+ // transfer the data again.
+#ifdef _LIBCPP_OPENMP_OFFLOAD_MAPPED
+ if (omp_target_is_present(data,device)){
+ return;
+ }
+#endif
+ // If it is a small amount of data it does not make sense to offload to a
+ // device
+ if (!OMPIsOffloadable(length)){
+ return;
+ }
+ _PSTL_PRAGMA_DATA_MAP_FROM(data,length);
+}
+
template <typename T, typename N>
void inline OMPMapTo(T data,N length) {
OMPMapToIf(data,length);
@@ -1501,7 +1523,7 @@ void inline OMPMapAlloc(std::__wrap_iter<T*> w,N length) {
template <typename T,typename N>
void inline OMPMapFrom(T data,N length) {
- _PSTL_PRAGMA_DATA_MAP_FROM(data,length);
+ OMPMapFromIf(data,length);
}
/**
@@ -1512,7 +1534,7 @@ template <typename T,typename N>
void inline OMPMapFrom(std::__wrap_iter<T*> w,N length) {
std::pointer_traits<std::__wrap_iter<T*>> PT;
T* data = PT.to_address(w);
- _PSTL_PRAGMA_DATA_MAP_FROM(data,length);
+ OMPMapFromIf(data,length);
}
# define _PSTL_OMP_MAP_TO(DATA,LEN) OMPMapTo(DATA,LEN)
# define _PSTL_OMP_MAP_ALLOC(DATA,LEN) OMPMapAlloc(DATA,LEN)
>From 70631edaac5bba8c798d009ae2de3ec62f9e41e5 Mon Sep 17 00:00:00 2001
From: antonrydahl <rydahl2610 at gmail.com>
Date: Wed, 20 Sep 2023 10:28:58 -0700
Subject: [PATCH 4/4] Offloading version of transform with three iterator
inputs
---
.../pstl_backends/cpu_backends/fill.h | 26 +++++++++++++-
.../pstl_backends/cpu_backends/transform.h | 36 ++++++++++++++++++-
2 files changed, 60 insertions(+), 2 deletions(-)
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h
index 7f6687306970364..b7e12adec1e8133 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h
@@ -27,9 +27,33 @@ _LIBCPP_BEGIN_NAMESPACE_STD
template <class _Index, class _DifferenceType, class _Tp>
_LIBCPP_HIDE_FROM_ABI _Index __simd_fill_n(_Index __first, _DifferenceType __n, const _Tp& __value) noexcept {
_PSTL_USE_NONTEMPORAL_STORES_IF_ALLOWED
- _PSTL_PRAGMA_SIMD()
+ _PSTL_OMP_MAP_TO(__first, __n);
+# pragma omp target enter data map(to : __value)
+ _PSTL_PRAGMA_SIMD(__n)
for (_DifferenceType __i = 0; __i < __n; ++__i)
__first[__i] = __value;
+ _PSTL_OMP_MAP_FROM(__first, __n);
+ return __first + __n;
+}
+
+/**
+ * Specialization for std::vector where the base pointer must be extracted to map
+ * the data to and from the GPU.
+ */
+
+template <typename T, class _DifferenceType, class _Tp>
+_LIBCPP_HIDE_FROM_ABI std::__wrap_iter<T*>
+__simd_fill_n(std::__wrap_iter<T*> __first, _DifferenceType __n, const _Tp& __value) noexcept {
+ _PSTL_USE_NONTEMPORAL_STORES_IF_ALLOWED
+ _PSTL_OMP_MAP_TO(__first, __n);
+ // For std::vector the base pointer of the data buffer needs to be extracted
+ std::pointer_traits<std::__wrap_iter<T*>> PT;
+ T* data = PT.to_address(__first);
+# pragma omp target enter data map(to : __value)
+ _PSTL_PRAGMA_SIMD(__n)
+ for (_DifferenceType __i = 0; __i < __n; ++__i)
+ data[__i] = __value;
+ _PSTL_OMP_MAP_FROM(__first, __n);
return __first + __n;
}
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h
index 06348b0fc5c2a5b..e6cd70b5a420bef 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h
@@ -96,9 +96,43 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Iterator3, class _Function>
_LIBCPP_HIDE_FROM_ABI _Iterator3 __simd_walk_3(
_Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Iterator3 __first3, _Function __f) noexcept {
- _PSTL_PRAGMA_SIMD()
+ _PSTL_OMP_MAP_TO(__first1, __n);
+ _PSTL_OMP_MAP_TO(__first2, __n);
+ _PSTL_OMP_MAP_TO(__first3, __n);
+ _PSTL_PRAGMA_SIMD(__n)
for (_DifferenceType __i = 0; __i < __n; ++__i)
__f(__first1[__i], __first2[__i], __first3[__i]);
+ _PSTL_OMP_MAP_FROM(__first2, __n);
+ _PSTL_OMP_MAP_FROM(__first3, __n);
+ return __first3 + __n;
+}
+
+/**
+ * Specialization for std::vector where the base pointer must be extracted to map
+ * the data to and from the GPU.
+ */
+
+template <typename T1, class _DifferenceType, typename T2, typename T3, class _Function>
+_LIBCPP_HIDE_FROM_ABI std::__wrap_iter<T3*>
+__simd_walk_3(std::__wrap_iter<T1*> __first1,
+ _DifferenceType __n,
+ std::__wrap_iter<T2*> __first2,
+ std::__wrap_iter<T3*> __first3,
+ _Function __f) noexcept {
+ _PSTL_OMP_MAP_TO(__first1, __n);
+ _PSTL_OMP_MAP_TO(__first2, __n);
+ _PSTL_OMP_MAP_TO(__first3, __n);
+ std::pointer_traits<std::__wrap_iter<T1*>> PT1;
+ std::pointer_traits<std::__wrap_iter<T2*>> PT2;
+ std::pointer_traits<std::__wrap_iter<T3*>> PT3;
+ T1* __data1 = PT1.to_address(__first1);
+ T2* __data2 = PT2.to_address(__first2);
+ T3* __data3 = PT3.to_address(__first3);
+ _PSTL_PRAGMA_SIMD(__n)
+ for (_DifferenceType __i = 0; __i < __n; ++__i)
+ __f(__data1[__i], __data2[__i], __data3[__i]);
+ _PSTL_OMP_MAP_FROM(__first2, __n);
+ _PSTL_OMP_MAP_FROM(__first3, __n);
return __first3 + __n;
}
template <class _ExecutionPolicy,
More information about the libcxx-commits
mailing list