[libcxx-commits] [libcxx] [RFC] Offloading C++ standard parallel algorithms to GPUs using OpenMP (PR #66465)

Anton Rydahl via libcxx-commits libcxx-commits at lists.llvm.org
Fri Sep 15 10:29:31 PDT 2023


https://github.com/AntonRydahl updated https://github.com/llvm/llvm-project/pull/66465

>From f5f6b3ff15a2e26656221950e8e3fbe85766a0ef Mon Sep 17 00:00:00 2001
From: antonrydahl <rydahl2610 at gmail.com>
Date: Thu, 14 Sep 2023 23:43:43 -0700
Subject: [PATCH 1/2] Proof of concept for offloading C++ standard parallel
 algorithms to the GPU using OpenMP. To enable offloading, compile with
 -D_LIBCPP_ENABLE_OPENMP_OFFLOAD

---
 .../pstl_backends/cpu_backends/fill.h         |   2 +-
 .../pstl_backends/cpu_backends/for_each.h     |  21 ++-
 .../pstl_backends/cpu_backends/transform.h    |  28 +++-
 .../cpu_backends/transform_reduce.h           |   8 +-
 libcxx/include/__config                       | 121 +++++++++++++++++-
 5 files changed, 169 insertions(+), 11 deletions(-)

diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h
index 8b531887c731808..7f6687306970364 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h
@@ -27,7 +27,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 template <class _Index, class _DifferenceType, class _Tp>
 _LIBCPP_HIDE_FROM_ABI _Index __simd_fill_n(_Index __first, _DifferenceType __n, const _Tp& __value) noexcept {
   _PSTL_USE_NONTEMPORAL_STORES_IF_ALLOWED
-  _PSTL_PRAGMA_SIMD
+  _PSTL_PRAGMA_SIMD()
   for (_DifferenceType __i = 0; __i < __n; ++__i)
     __first[__i] = __value;
   return __first + __n;
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h
index f6f22fdd8713ced..1155c070496c87c 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h
@@ -26,10 +26,29 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <class _Iterator, class _DifferenceType, class _Function>
 _LIBCPP_HIDE_FROM_ABI _Iterator __simd_walk_1(_Iterator __first, _DifferenceType __n, _Function __f) noexcept {
-  _PSTL_PRAGMA_SIMD
+  _PSTL_OMP_MAP_TO(__first,__n);
+  _PSTL_PRAGMA_SIMD(__n)
   for (_DifferenceType __i = 0; __i < __n; ++__i)
     __f(__first[__i]);
+  _PSTL_OMP_MAP_FROM(__first,__n);
+  return __first + __n;
+}
 
+/**
+ * Specialization for std::vector where the base pointer must be extrated to map
+ * the data to and from the GPU.
+*/
+
+template <typename T, class _DifferenceType, class _Function>
+_LIBCPP_HIDE_FROM_ABI std::__wrap_iter<T*> __simd_walk_1(std::__wrap_iter<T*> __first, _DifferenceType __n, _Function __f) noexcept {
+  _PSTL_OMP_MAP_TO(__first,__n);
+  // For std::vector the base pointer of the data buffer needs to be extracted 
+  std::pointer_traits<std::__wrap_iter<T*>> PT;
+  T* data = PT.to_address(__first);
+  _PSTL_PRAGMA_SIMD(__n)
+  for (_DifferenceType __i = 0; __i < __n; ++__i)
+    __f(data[__i]);
+  _PSTL_OMP_MAP_FROM(__first,__n);
   return __first + __n;
 }
 
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h
index 0259d8a84bb3f76..6611be3b7a44833 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h
@@ -30,9 +30,33 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Function>
 _LIBCPP_HIDE_FROM_ABI _Iterator2
 __simd_walk_2(_Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Function __f) noexcept {
-  _PSTL_PRAGMA_SIMD
+  _PSTL_OMP_MAP_TO(__first1,__n);
+  _PSTL_OMP_MAP_ALLOC(__first2,__n);
+  _PSTL_PRAGMA_SIMD(__n)
   for (_DifferenceType __i = 0; __i < __n; ++__i)
     __f(__first1[__i], __first2[__i]);
+  _PSTL_OMP_MAP_FROM(__first2,__n);
+  return __first2 + __n;
+}
+
+/**
+ * Specialization for std::vector where the base pointer must be extrated to map
+ * the data to and from the GPU.
+*/
+
+template <typename T1, class _DifferenceType, typename T2, class _Function>
+_LIBCPP_HIDE_FROM_ABI std::__wrap_iter<T1*>
+__simd_walk_2(std::__wrap_iter<T1*> __first1, _DifferenceType __n, std::__wrap_iter<T2*> __first2, _Function __f) noexcept {
+  _PSTL_OMP_MAP_TO(__first1,__n);
+  _PSTL_OMP_MAP_ALLOC(__first2,__n);
+  std::pointer_traits<std::__wrap_iter<T1*>> PT1;
+  std::pointer_traits<std::__wrap_iter<T2*>> PT2;
+  T1* __data1 = PT1.to_address(__first1);
+  T2* __data2 = PT2.to_address(__first2);
+  _PSTL_PRAGMA_SIMD(__n)
+  for (_DifferenceType __i = 0; __i < __n; ++__i)
+    __f(__data1[__i], __data2[__i]);
+  _PSTL_OMP_MAP_FROM(__first2,__n);
   return __first2 + __n;
 }
 
@@ -72,7 +96,7 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
 template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Iterator3, class _Function>
 _LIBCPP_HIDE_FROM_ABI _Iterator3 __simd_walk_3(
     _Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Iterator3 __first3, _Function __f) noexcept {
-  _PSTL_PRAGMA_SIMD
+  _PSTL_PRAGMA_SIMD()
   for (_DifferenceType __i = 0; __i < __n; ++__i)
     __f(__first1[__i], __first2[__i], __first3[__i]);
   return __first3 + __n;
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h
index 2afe5c7d10483fe..885648982409cb9 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h
@@ -57,7 +57,7 @@ __simd_transform_reduce(_Size __n, _Tp __init, _BinaryOperation __binary_op, _Un
     _Tp* __lane = reinterpret_cast<_Tp*>(__lane_buffer);
 
     // initializer
-    _PSTL_PRAGMA_SIMD
+    _PSTL_PRAGMA_SIMD()
     for (_Size __i = 0; __i < __block_size; ++__i) {
       ::new (__lane + __i) _Tp(__binary_op(__f(__i), __f(__block_size + __i)));
     }
@@ -65,13 +65,13 @@ __simd_transform_reduce(_Size __n, _Tp __init, _BinaryOperation __binary_op, _Un
     _Size __i                    = 2 * __block_size;
     const _Size __last_iteration = __block_size * (__n / __block_size);
     for (; __i < __last_iteration; __i += __block_size) {
-      _PSTL_PRAGMA_SIMD
+      _PSTL_PRAGMA_SIMD()
       for (_Size __j = 0; __j < __block_size; ++__j) {
         __lane[__j] = __binary_op(std::move(__lane[__j]), __f(__i + __j));
       }
     }
     // remainder
-    _PSTL_PRAGMA_SIMD
+    _PSTL_PRAGMA_SIMD()
     for (_Size __j = 0; __j < __n - __last_iteration; ++__j) {
       __lane[__j] = __binary_op(std::move(__lane[__j]), __f(__last_iteration + __j));
     }
@@ -80,7 +80,7 @@ __simd_transform_reduce(_Size __n, _Tp __init, _BinaryOperation __binary_op, _Un
       __init = __binary_op(std::move(__init), std::move(__lane[__j]));
     }
     // destroyer
-    _PSTL_PRAGMA_SIMD
+    _PSTL_PRAGMA_SIMD()
     for (_Size __j = 0; __j < __block_size; ++__j) {
       __lane[__j].~_Tp();
     }
diff --git a/libcxx/include/__config b/libcxx/include/__config
index bf2564e2732ba98..087c5c91235511d 100644
--- a/libcxx/include/__config
+++ b/libcxx/include/__config
@@ -1414,8 +1414,118 @@ __sanitizer_verify_double_ended_contiguous_container(const void*, const void*, c
 // Enable SIMD for compilers that support OpenMP 4.0
 #  if (defined(_OPENMP) && _OPENMP >= 201307)
 
+#    ifdef _LIBCPP_ENABLE_OPENMP_OFFLOAD
+#      ifndef _LIBCPP_ENABLE_OPENMP_OFFLOAD_MIN_BYTES
+#        define _LIBCPP_ENABLE_OPENMP_OFFLOAD_MIN_BYTES 32768
+#      endif
+#include <omp.h>
+#include <__iterator/wrap_iter.h>
+#      define _PSTL_PRAGMA_DATA_MAP_TO(NAME,LEN) _PSTL_PRAGMA(omp target enter data map(to:NAME[:LEN]))
+#      define _PSTL_PRAGMA_DATA_MAP_ALLOC(NAME,LEN) _PSTL_PRAGMA(omp target enter data map(alloc:NAME[:LEN]))
+#      define _PSTL_PRAGMA_DATA_MAP_FROM(NAME,LEN) _PSTL_PRAGMA(omp target update from(NAME[:LEN]))
+
+template <typename N>
+bool constexpr OMPIsOffloadable(N size)
+{
+      return size >= _LIBCPP_ENABLE_OPENMP_OFFLOAD_MIN_BYTES;
+}
+
+bool constexpr OMPIsOffloadable(void)
+{
+      return false;
+}
+
+template <typename T, typename N>
+void inline OMPMapToIf(T data,N length,int device = omp_get_default_device())
+{
+      // If the data is already present on the device, there is no need to
+      // transfer the data again.
+      if (omp_target_is_present(data,device)){
+            return;
+      }
+      // If it is a small amount of data it does not make sense to offload to a
+      // device
+      if (!OMPIsOffloadable(length)){
+            return;
+      }
+      _PSTL_PRAGMA_DATA_MAP_TO(data,length);
+}
+
+template <typename T, typename N>
+void inline OMPMapAllocIf(T data,N length,int device = omp_get_default_device())
+{
+      // If the data is already present on the device, there is no need to
+      // allocate it again.
+      if (omp_target_is_present(data,device)){
+            return;
+      }
+      // If it is a small amount of data it does not make sense to offload to a
+      // device
+      if (!OMPIsOffloadable(length)){
+            return;
+      }
+      _PSTL_PRAGMA_DATA_MAP_ALLOC(data,length);
+}
+
+template <typename T, typename N>
+void inline OMPMapTo(T data,N length) {
+  OMPMapToIf(data,length);
+}
+
+/**
+ * Overload for std::__wrap_iter<T*> (e.g. std::vector): maps via the raw pointer
+*/
+
+template <typename T,typename N>
+void inline OMPMapTo(std::__wrap_iter<T*> w,N length) {
+  std::pointer_traits<std::__wrap_iter<T*>> PT;
+  T* data = PT.to_address(w);
+  OMPMapToIf(data,length);
+}
+
+template <typename T, typename N>
+void inline OMPMapAlloc(T data,N length) {
+  OMPMapAllocIf(data,length);
+}
+
+/**
+ * Overload for std::__wrap_iter<T*> (e.g. std::vector): allocates via the raw pointer
+*/
+
+template <typename T,typename N>
+void inline OMPMapAlloc(std::__wrap_iter<T*> w,N length) {
+  std::pointer_traits<std::__wrap_iter<T*>> PT;
+  T* data = PT.to_address(w);
+  OMPMapAllocIf(data,length);
+}
+
+template <typename T,typename N>
+void inline OMPMapFrom(T data,N length) {
+  _PSTL_PRAGMA_DATA_MAP_FROM(data,length);
+}
+
+/**
+ * Overload for std::__wrap_iter<T*> (e.g. std::vector): updates from the device via the raw pointer
+*/
+
+template <typename T,typename N>
+void inline OMPMapFrom(std::__wrap_iter<T*> w,N length) {
+  std::pointer_traits<std::__wrap_iter<T*>> PT;
+  T* data = PT.to_address(w);
+  _PSTL_PRAGMA_DATA_MAP_FROM(data,length);
+}
+#      define _PSTL_OMP_MAP_TO(DATA,LEN) OMPMapTo(DATA,LEN)
+#      define _PSTL_OMP_MAP_ALLOC(DATA,LEN) OMPMapAlloc(DATA,LEN)
+#      define _PSTL_OMP_MAP_FROM(DATA,LEN) OMPMapFrom(DATA,LEN)
+#      define _PSTL_PRAGMA_SIMD(...) _PSTL_PRAGMA(omp target teams distribute parallel for simd if(OMPIsOffloadable(__VA_ARGS__)))
+#    else
+#      define _PSTL_PRAGMA_SIMD(...) _PSTL_PRAGMA(omp simd)
+#      define _PSTL_OMP_MAP_TO(DATA,LEN)
+#      define _PSTL_OMP_MAP_ALLOC(DATA,LEN)
+#      define _PSTL_OMP_MAP_FROM(DATA,LEN)
+#    endif
+
 #    define _PSTL_UDR_PRESENT
-#    define _PSTL_PRAGMA_SIMD _PSTL_PRAGMA(omp simd)
 #    define _PSTL_PRAGMA_DECLARE_SIMD _PSTL_PRAGMA(omp declare simd)
 #    define _PSTL_PRAGMA_SIMD_REDUCTION(PRM) _PSTL_PRAGMA(omp simd reduction(PRM))
 #    define _PSTL_PRAGMA_SIMD_SCAN(PRM) _PSTL_PRAGMA(omp simd reduction(inscan, PRM))
@@ -1434,7 +1544,7 @@ __sanitizer_verify_double_ended_contiguous_container(const void*, const void*, c
 
 #  elif defined(_LIBCPP_COMPILER_CLANG_BASED)
 
-#    define _PSTL_PRAGMA_SIMD _Pragma("clang loop vectorize(enable) interleave(enable)")
+#    define _PSTL_PRAGMA_SIMD(...) _Pragma("clang loop vectorize(enable) interleave(enable)")
 #    define _PSTL_PRAGMA_DECLARE_SIMD
 #    define _PSTL_PRAGMA_SIMD_REDUCTION(PRM) _Pragma("clang loop vectorize(enable) interleave(enable)")
 #    define _PSTL_PRAGMA_SIMD_SCAN(PRM) _Pragma("clang loop vectorize(enable) interleave(enable)")
@@ -1444,7 +1554,7 @@ __sanitizer_verify_double_ended_contiguous_container(const void*, const void*, c
 
 #  else // (defined(_OPENMP) && _OPENMP >= 201307)
 
-#    define _PSTL_PRAGMA_SIMD
+#    define _PSTL_PRAGMA_SIMD(...)
 #    define _PSTL_PRAGMA_DECLARE_SIMD
 #    define _PSTL_PRAGMA_SIMD_REDUCTION(PRM)
 #    define _PSTL_PRAGMA_SIMD_SCAN(PRM)
@@ -1454,6 +1564,11 @@ __sanitizer_verify_double_ended_contiguous_container(const void*, const void*, c
 
 #  endif // (defined(_OPENMP) && _OPENMP >= 201307)
 
+#  ifndef _LIBCPP_ENABLE_OPENMP_OFFLOAD
+#  define _PSTL_OMP_MAP_TO(...)
+#  define _PSTL_OMP_MAP_FROM(...)
+#  endif 
+
 #  define _PSTL_USE_NONTEMPORAL_STORES_IF_ALLOWED
 
 #endif // __cplusplus

>From 897ee3a2029ee77ec56ac62d4f4e7a409e9308f8 Mon Sep 17 00:00:00 2001
From: antonrydahl <rydahl2610 at gmail.com>
Date: Fri, 15 Sep 2023 10:29:04 -0700
Subject: [PATCH 2/2] Clang-formatted for_each.h and transform.h

---
 .../pstl_backends/cpu_backends/for_each.h      | 15 ++++++++-------
 .../pstl_backends/cpu_backends/transform.h     | 18 +++++++++---------
 2 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h
index 1155c070496c87c..ed336766295cb07 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h
@@ -26,29 +26,30 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <class _Iterator, class _DifferenceType, class _Function>
 _LIBCPP_HIDE_FROM_ABI _Iterator __simd_walk_1(_Iterator __first, _DifferenceType __n, _Function __f) noexcept {
-  _PSTL_OMP_MAP_TO(__first,__n);
+  _PSTL_OMP_MAP_TO(__first, __n);
   _PSTL_PRAGMA_SIMD(__n)
   for (_DifferenceType __i = 0; __i < __n; ++__i)
     __f(__first[__i]);
-  _PSTL_OMP_MAP_FROM(__first,__n);
+  _PSTL_OMP_MAP_FROM(__first, __n);
   return __first + __n;
 }
 
 /**
  * Specialization for std::vector where the base pointer must be extrated to map
  * the data to and from the GPU.
-*/
+ */
 
 template <typename T, class _DifferenceType, class _Function>
-_LIBCPP_HIDE_FROM_ABI std::__wrap_iter<T*> __simd_walk_1(std::__wrap_iter<T*> __first, _DifferenceType __n, _Function __f) noexcept {
-  _PSTL_OMP_MAP_TO(__first,__n);
-  // For std::vector the base pointer of the data buffer needs to be extracted 
+_LIBCPP_HIDE_FROM_ABI std::__wrap_iter<T*>
+__simd_walk_1(std::__wrap_iter<T*> __first, _DifferenceType __n, _Function __f) noexcept {
+  _PSTL_OMP_MAP_TO(__first, __n);
+  // For std::vector the base pointer of the data buffer needs to be extracted
   std::pointer_traits<std::__wrap_iter<T*>> PT;
   T* data = PT.to_address(__first);
   _PSTL_PRAGMA_SIMD(__n)
   for (_DifferenceType __i = 0; __i < __n; ++__i)
     __f(data[__i]);
-  _PSTL_OMP_MAP_FROM(__first,__n);
+  _PSTL_OMP_MAP_FROM(__first, __n);
   return __first + __n;
 }
 
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h
index 6611be3b7a44833..06348b0fc5c2a5b 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h
@@ -30,25 +30,25 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Function>
 _LIBCPP_HIDE_FROM_ABI _Iterator2
 __simd_walk_2(_Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Function __f) noexcept {
-  _PSTL_OMP_MAP_TO(__first1,__n);
-  _PSTL_OMP_MAP_ALLOC(__first2,__n);
+  _PSTL_OMP_MAP_TO(__first1, __n);
+  _PSTL_OMP_MAP_ALLOC(__first2, __n);
   _PSTL_PRAGMA_SIMD(__n)
   for (_DifferenceType __i = 0; __i < __n; ++__i)
     __f(__first1[__i], __first2[__i]);
-  _PSTL_OMP_MAP_FROM(__first2,__n);
+  _PSTL_OMP_MAP_FROM(__first2, __n);
   return __first2 + __n;
 }
 
 /**
  * Specialization for std::vector where the base pointer must be extrated to map
  * the data to and from the GPU.
-*/
+ */
 
 template <typename T1, class _DifferenceType, typename T2, class _Function>
-_LIBCPP_HIDE_FROM_ABI std::__wrap_iter<T1*>
-__simd_walk_2(std::__wrap_iter<T1*> __first1, _DifferenceType __n, std::__wrap_iter<T2*> __first2, _Function __f) noexcept {
-  _PSTL_OMP_MAP_TO(__first1,__n);
-  _PSTL_OMP_MAP_ALLOC(__first2,__n);
+_LIBCPP_HIDE_FROM_ABI std::__wrap_iter<T1*> __simd_walk_2(
+    std::__wrap_iter<T1*> __first1, _DifferenceType __n, std::__wrap_iter<T2*> __first2, _Function __f) noexcept {
+  _PSTL_OMP_MAP_TO(__first1, __n);
+  _PSTL_OMP_MAP_ALLOC(__first2, __n);
   std::pointer_traits<std::__wrap_iter<T1*>> PT1;
   std::pointer_traits<std::__wrap_iter<T2*>> PT2;
   T1* __data1 = PT1.to_address(__first1);
@@ -56,7 +56,7 @@ __simd_walk_2(std::__wrap_iter<T1*> __first1, _DifferenceType __n, std::__wrap_i
   _PSTL_PRAGMA_SIMD(__n)
   for (_DifferenceType __i = 0; __i < __n; ++__i)
     __f(__data1[__i], __data2[__i]);
-  _PSTL_OMP_MAP_FROM(__first2,__n);
+  _PSTL_OMP_MAP_FROM(__first2, __n);
   return __first2 + __n;
 }
 



More information about the libcxx-commits mailing list