[libcxx-commits] [libcxx] [RFC] Offloading C++ standard parallel algorithms to GPUs using OpenMP (PR #66465)
Anton Rydahl via libcxx-commits
libcxx-commits at lists.llvm.org
Thu Sep 14 23:55:22 PDT 2023
https://github.com/AntonRydahl created https://github.com/llvm/llvm-project/pull/66465
This draft pull request accompanies a request for comments posted on discourse.llvm.org.
The overall idea is to show that we could potentially offload C++ standard parallel algorithms to GPUs, as the NVIDIA compilers do, but using OpenMP target offloading. It has been suggested to use the AMD implementation, but I think it is worth making a simultaneous attempt to reuse the implementations from `libcxx`.
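To make the idea concrete, here is a minimal user-side sketch (my own illustration, not taken from the patch or the test repository) of the kind of call the OpenMP backend would offload; the buffer size and lambda are placeholders:

```cpp
#include <algorithm>
#include <execution>
#include <vector>

int main() {
  std::vector<float> x(1 << 22, 1.0f);
  const float a = 2.0f;
  // With _LIBCPP_ENABLE_OPENMP_OFFLOAD defined and OpenMP target offloading
  // enabled, the parallel unsequenced policy dispatches to the OpenMP backend,
  // which maps the vector's buffer to the device (provided it exceeds the size
  // threshold) and runs the loop as an
  // "omp target teams distribute parallel for simd" region.
  std::for_each(std::execution::par_unseq, x.begin(), x.end(),
                [a](float& v) { v *= a; });
}
```

With Clang, the offloaded path would be enabled by something along the lines of `clang++ -stdlib=libc++ -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -D_LIBCPP_ENABLE_OPENMP_OFFLOAD`, but the exact invocation depends on the toolchain and target architecture.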
I have made another small GitHub [repository](https://github.com/AntonRydahl/ompstdpar) with the tests I have used to verify the implementation. The repository also contains the generated temporary files, in case you want to dive into the code that is actually produced.
The tests in the aforementioned repository use the standard library array and vector templates. Ideally, the implementation should also be tested with other standard library templates and more complicated lambdas.
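As a second illustrative sketch (again my own, not from the test repository), a unary `std::transform` over two `std::vector`s would exercise the `std::__wrap_iter` specializations added in `transform.h` below: the input buffer is mapped to the device, the output buffer is allocated there, and the result is copied back when the loop finishes.

```cpp
#include <algorithm>
#include <execution>
#include <vector>

int main() {
  std::vector<double> x(1 << 22, 1.0);
  std::vector<double> y(x.size());
  // This dispatches to __simd_walk_2: x is mapped "to" the device, y is
  // "alloc"-mapped, the loop body runs in a target region, and y is updated
  // "from" the device afterwards.
  std::transform(std::execution::par_unseq, x.begin(), x.end(), y.begin(),
                 [](double v) { return 2.0 * v + 1.0; });
}
```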
From f5f6b3ff15a2e26656221950e8e3fbe85766a0ef Mon Sep 17 00:00:00 2001
From: antonrydahl <rydahl2610 at gmail.com>
Date: Thu, 14 Sep 2023 23:43:43 -0700
Subject: [PATCH] Proof of concept for offloading C++ standard parallel
algorithms to the GPU using OpenMP. To enable offloading, compile with
-D_LIBCPP_ENABLE_OPENMP_OFFLOAD
---
.../pstl_backends/cpu_backends/fill.h | 2 +-
.../pstl_backends/cpu_backends/for_each.h | 21 ++-
.../pstl_backends/cpu_backends/transform.h | 28 +++-
.../cpu_backends/transform_reduce.h | 8 +-
libcxx/include/__config | 121 +++++++++++++++++-
5 files changed, 169 insertions(+), 11 deletions(-)
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h
index 8b531887c731808..7f6687306970364 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h
@@ -27,7 +27,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
template <class _Index, class _DifferenceType, class _Tp>
_LIBCPP_HIDE_FROM_ABI _Index __simd_fill_n(_Index __first, _DifferenceType __n, const _Tp& __value) noexcept {
_PSTL_USE_NONTEMPORAL_STORES_IF_ALLOWED
- _PSTL_PRAGMA_SIMD
+ _PSTL_PRAGMA_SIMD()
for (_DifferenceType __i = 0; __i < __n; ++__i)
__first[__i] = __value;
return __first + __n;
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h
index f6f22fdd8713ced..1155c070496c87c 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h
@@ -26,10 +26,29 @@ _LIBCPP_BEGIN_NAMESPACE_STD
template <class _Iterator, class _DifferenceType, class _Function>
_LIBCPP_HIDE_FROM_ABI _Iterator __simd_walk_1(_Iterator __first, _DifferenceType __n, _Function __f) noexcept {
- _PSTL_PRAGMA_SIMD
+ _PSTL_OMP_MAP_TO(__first,__n);
+ _PSTL_PRAGMA_SIMD(__n)
for (_DifferenceType __i = 0; __i < __n; ++__i)
__f(__first[__i]);
+ _PSTL_OMP_MAP_FROM(__first,__n);
+ return __first + __n;
+}
+/**
+ * Specialization for std::vector where the base pointer must be extracted to map
+ * the data to and from the GPU.
+*/
+
+template <typename T, class _DifferenceType, class _Function>
+_LIBCPP_HIDE_FROM_ABI std::__wrap_iter<T*> __simd_walk_1(std::__wrap_iter<T*> __first, _DifferenceType __n, _Function __f) noexcept {
+ _PSTL_OMP_MAP_TO(__first,__n);
+ // For std::vector the base pointer of the data buffer needs to be extracted
+ std::pointer_traits<std::__wrap_iter<T*>> PT;
+ T* data = PT.to_address(__first);
+ _PSTL_PRAGMA_SIMD(__n)
+ for (_DifferenceType __i = 0; __i < __n; ++__i)
+ __f(data[__i]);
+ _PSTL_OMP_MAP_FROM(__first,__n);
return __first + __n;
}
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h
index 0259d8a84bb3f76..6611be3b7a44833 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h
@@ -30,9 +30,33 @@ _LIBCPP_BEGIN_NAMESPACE_STD
template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Function>
_LIBCPP_HIDE_FROM_ABI _Iterator2
__simd_walk_2(_Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Function __f) noexcept {
- _PSTL_PRAGMA_SIMD
+ _PSTL_OMP_MAP_TO(__first1,__n);
+ _PSTL_OMP_MAP_ALLOC(__first2,__n);
+ _PSTL_PRAGMA_SIMD(__n)
for (_DifferenceType __i = 0; __i < __n; ++__i)
__f(__first1[__i], __first2[__i]);
+ _PSTL_OMP_MAP_FROM(__first2,__n);
+ return __first2 + __n;
+}
+
+/**
+ * Specialization for std::vector where the base pointer must be extracted to map
+ * the data to and from the GPU.
+*/
+
+template <typename T1, class _DifferenceType, typename T2, class _Function>
+_LIBCPP_HIDE_FROM_ABI std::__wrap_iter<T1*>
+__simd_walk_2(std::__wrap_iter<T1*> __first1, _DifferenceType __n, std::__wrap_iter<T2*> __first2, _Function __f) noexcept {
+ _PSTL_OMP_MAP_TO(__first1,__n);
+ _PSTL_OMP_MAP_ALLOC(__first2,__n);
+ std::pointer_traits<std::__wrap_iter<T1*>> PT1;
+ std::pointer_traits<std::__wrap_iter<T2*>> PT2;
+ T1* __data1 = PT1.to_address(__first1);
+ T2* __data2 = PT2.to_address(__first2);
+ _PSTL_PRAGMA_SIMD(__n)
+ for (_DifferenceType __i = 0; __i < __n; ++__i)
+ __f(__data1[__i], __data2[__i]);
+ _PSTL_OMP_MAP_FROM(__first2,__n);
return __first2 + __n;
}
@@ -72,7 +96,7 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform(
template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Iterator3, class _Function>
_LIBCPP_HIDE_FROM_ABI _Iterator3 __simd_walk_3(
_Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Iterator3 __first3, _Function __f) noexcept {
- _PSTL_PRAGMA_SIMD
+ _PSTL_PRAGMA_SIMD()
for (_DifferenceType __i = 0; __i < __n; ++__i)
__f(__first1[__i], __first2[__i], __first3[__i]);
return __first3 + __n;
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h
index 2afe5c7d10483fe..885648982409cb9 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h
@@ -57,7 +57,7 @@ __simd_transform_reduce(_Size __n, _Tp __init, _BinaryOperation __binary_op, _Un
_Tp* __lane = reinterpret_cast<_Tp*>(__lane_buffer);
// initializer
- _PSTL_PRAGMA_SIMD
+ _PSTL_PRAGMA_SIMD()
for (_Size __i = 0; __i < __block_size; ++__i) {
::new (__lane + __i) _Tp(__binary_op(__f(__i), __f(__block_size + __i)));
}
@@ -65,13 +65,13 @@ __simd_transform_reduce(_Size __n, _Tp __init, _BinaryOperation __binary_op, _Un
_Size __i = 2 * __block_size;
const _Size __last_iteration = __block_size * (__n / __block_size);
for (; __i < __last_iteration; __i += __block_size) {
- _PSTL_PRAGMA_SIMD
+ _PSTL_PRAGMA_SIMD()
for (_Size __j = 0; __j < __block_size; ++__j) {
__lane[__j] = __binary_op(std::move(__lane[__j]), __f(__i + __j));
}
}
// remainder
- _PSTL_PRAGMA_SIMD
+ _PSTL_PRAGMA_SIMD()
for (_Size __j = 0; __j < __n - __last_iteration; ++__j) {
__lane[__j] = __binary_op(std::move(__lane[__j]), __f(__last_iteration + __j));
}
@@ -80,7 +80,7 @@ __simd_transform_reduce(_Size __n, _Tp __init, _BinaryOperation __binary_op, _Un
__init = __binary_op(std::move(__init), std::move(__lane[__j]));
}
// destroyer
- _PSTL_PRAGMA_SIMD
+ _PSTL_PRAGMA_SIMD()
for (_Size __j = 0; __j < __block_size; ++__j) {
__lane[__j].~_Tp();
}
diff --git a/libcxx/include/__config b/libcxx/include/__config
index bf2564e2732ba98..087c5c91235511d 100644
--- a/libcxx/include/__config
+++ b/libcxx/include/__config
@@ -1414,8 +1414,118 @@ __sanitizer_verify_double_ended_contiguous_container(const void*, const void*, c
// Enable SIMD for compilers that support OpenMP 4.0
# if (defined(_OPENMP) && _OPENMP >= 201307)
+# ifdef _LIBCPP_ENABLE_OPENMP_OFFLOAD
+# ifndef _LIBCPP_ENABLE_OPENMP_OFFLOAD_MIN_BYTES
+# define _LIBCPP_ENABLE_OPENMP_OFFLOAD_MIN_BYTES 32768
+# endif
+#include <omp.h>
+#include <__iterator/wrap_iter.h>
+# define _PSTL_PRAGMA_DATA_MAP_TO(NAME,LEN) _PSTL_PRAGMA(omp target enter data map(to:NAME[:LEN]))
+# define _PSTL_PRAGMA_DATA_MAP_ALLOC(NAME,LEN) _PSTL_PRAGMA(omp target enter data map(alloc:NAME[:LEN]))
+# define _PSTL_PRAGMA_DATA_MAP_FROM(NAME,LEN) _PSTL_PRAGMA(omp target update from(NAME[:LEN]))
+
+template <typename N>
+bool constexpr OMPIsOffloadable(N size)
+{
+ return size >= _LIBCPP_ENABLE_OPENMP_OFFLOAD_MIN_BYTES;
+}
+
+bool constexpr OMPIsOffloadable(void)
+{
+ return false;
+}
+
+template <typename T, typename N>
+void inline OMPMapToIf(T data,N length,int device = omp_get_default_device())
+{
+ // If the data is already present on the device, there is no need
+ // to transfer the data again.
+ if (omp_target_is_present(data,device)){
+ return;
+ }
+ // If it is only a small amount of data, it does not make sense to offload
+ // to a device.
+ if (!OMPIsOffloadable(length)){
+ return;
+ }
+ _PSTL_PRAGMA_DATA_MAP_TO(data,length);
+}
+
+template <typename T, typename N>
+void inline OMPMapAllocIf(T data,N length,int device = omp_get_default_device())
+{
+ // If the data is already present on the device, there is no need
+ // to transfer the data again.
+ if (omp_target_is_present(data,device)){
+ return;
+ }
+ // If it is only a small amount of data, it does not make sense to offload
+ // to a device.
+ if (!OMPIsOffloadable(length)){
+ return;
+ }
+ _PSTL_PRAGMA_DATA_MAP_ALLOC(data,length);
+}
+
+template <typename T, typename N>
+void inline OMPMapTo(T data,N length) {
+ OMPMapToIf(data,length);
+}
+
+/**
+ * Specialization for std::vector
+*/
+
+template <typename T,typename N>
+void inline OMPMapTo(std::__wrap_iter<T*> w,N length) {
+ std::pointer_traits<std::__wrap_iter<T*>> PT;
+ T* data = PT.to_address(w);
+ OMPMapToIf(data,length);
+}
+
+template <typename T, typename N>
+void inline OMPMapAlloc(T data,N length) {
+ OMPMapAllocIf(data,length);
+}
+
+/**
+ * Specialization for std::vector
+*/
+
+template <typename T,typename N>
+void inline OMPMapAlloc(std::__wrap_iter<T*> w,N length) {
+ std::pointer_traits<std::__wrap_iter<T*>> PT;
+ T* data = PT.to_address(w);
+ OMPMapAllocIf(data,length);
+}
+
+template <typename T,typename N>
+void inline OMPMapFrom(T data,N length) {
+ _PSTL_PRAGMA_DATA_MAP_FROM(data,length);
+}
+
+/**
+ * Specialization for std::vector
+*/
+
+template <typename T,typename N>
+void inline OMPMapFrom(std::__wrap_iter<T*> w,N length) {
+ std::pointer_traits<std::__wrap_iter<T*>> PT;
+ T* data = PT.to_address(w);
+ _PSTL_PRAGMA_DATA_MAP_FROM(data,length);
+}
+# define _PSTL_OMP_MAP_TO(DATA,LEN) OMPMapTo(DATA,LEN)
+# define _PSTL_OMP_MAP_ALLOC(DATA,LEN) OMPMapAlloc(DATA,LEN)
+# define _PSTL_OMP_MAP_FROM(DATA,LEN) OMPMapFrom(DATA,LEN)
+# define _PSTL_PRAGMA_SIMD(...) _PSTL_PRAGMA(omp target teams distribute parallel for simd if(OMPIsOffloadable(__VA_ARGS__)))
+# else
+# define _PSTL_PRAGMA_SIMD(...) _PSTL_PRAGMA(omp simd)
+# define _PSTL_OMP_MAP_TO(DATA,LEN)
+# define _PSTL_OMP_MAP_ALLOC(DATA,LEN)
+# define _PSTL_OMP_MAP_FROM(DATA,LEN)
+# endif
+
# define _PSTL_UDR_PRESENT
-# define _PSTL_PRAGMA_SIMD _PSTL_PRAGMA(omp simd)
# define _PSTL_PRAGMA_DECLARE_SIMD _PSTL_PRAGMA(omp declare simd)
# define _PSTL_PRAGMA_SIMD_REDUCTION(PRM) _PSTL_PRAGMA(omp simd reduction(PRM))
# define _PSTL_PRAGMA_SIMD_SCAN(PRM) _PSTL_PRAGMA(omp simd reduction(inscan, PRM))
@@ -1434,7 +1544,7 @@ __sanitizer_verify_double_ended_contiguous_container(const void*, const void*, c
# elif defined(_LIBCPP_COMPILER_CLANG_BASED)
-# define _PSTL_PRAGMA_SIMD _Pragma("clang loop vectorize(enable) interleave(enable)")
+# define _PSTL_PRAGMA_SIMD(...) _Pragma("clang loop vectorize(enable) interleave(enable)")
# define _PSTL_PRAGMA_DECLARE_SIMD
# define _PSTL_PRAGMA_SIMD_REDUCTION(PRM) _Pragma("clang loop vectorize(enable) interleave(enable)")
# define _PSTL_PRAGMA_SIMD_SCAN(PRM) _Pragma("clang loop vectorize(enable) interleave(enable)")
@@ -1444,7 +1554,7 @@ __sanitizer_verify_double_ended_contiguous_container(const void*, const void*, c
# else // (defined(_OPENMP) && _OPENMP >= 201307)
-# define _PSTL_PRAGMA_SIMD
+# define _PSTL_PRAGMA_SIMD(...)
# define _PSTL_PRAGMA_DECLARE_SIMD
# define _PSTL_PRAGMA_SIMD_REDUCTION(PRM)
# define _PSTL_PRAGMA_SIMD_SCAN(PRM)
@@ -1454,6 +1564,11 @@ __sanitizer_verify_double_ended_contiguous_container(const void*, const void*, c
# endif // (defined(_OPENMP) && _OPENMP >= 201307)
+# ifndef _LIBCPP_ENABLE_OPENMP_OFFLOAD
+# define _PSTL_OMP_MAP_TO(...)
+# define _PSTL_OMP_MAP_FROM(...)
+# endif
+
# define _PSTL_USE_NONTEMPORAL_STORES_IF_ALLOWED
#endif // __cplusplus