[Openmp-commits] [llvm] [openmp] Reapply "[OpenMP][OMPX] Add shfl_down_sync (#93311)" (PR #94139)

Sat Jun 1 21:15:14 PDT 2024

================
@@ -0,0 +1,71 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+//
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+
+#include <cassert>
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <limits>
+#include <ompx.h>
+#include <type_traits>
+
+template <typename T, std::enable_if_t<std::is_integral<T>::value, bool> = true>
+bool equal(T LHS, T RHS) { // Overload enabled only when T is an integral type.
+  return LHS == RHS; // Exact comparison: true when both values are identical.
+}
+
+template <typename T, // Overload of equal() for floating-point element types;
+          std::enable_if_t<std::is_floating_point<T>::value, bool> = true>
+bool equal(T LHS, T RHS) { // enabled only when T is a floating-point type.
+  return std::abs(LHS - RHS) < std::numeric_limits<T>::epsilon(); // Absolute tolerance.
+} // NOTE: the tolerance is not scaled by the magnitude of LHS or RHS.
+
+template <typename T> void test() {
+  constexpr const int num_blocks = 1;
+  constexpr const int block_size = 256;
+  constexpr const int N = num_blocks * block_size;
+  T *data = new T[N];
+  int *res = new int[N];
+
+  for (int i = 0; i < N; ++i)
+    data[i] = i;
+
+#pragma omp target teams ompx_bare num_teams(num_blocks) thread_limit(block_size) \
+        map(to: data[0:N]) map(from: res[0:N])
+  {
+    int tid = ompx_thread_id_x();
+    data[tid] = ompx::shfl_down_sync(~0U, data[tid], 1);
+#ifdef __AMDGCN_WAVEFRONT_SIZE
+    int warp_size = __AMDGCN_WAVEFRONT_SIZE;
+#else
+    int warp_size = 32;
+#endif
+    if ((tid & (warp_size - 1)) != warp_size - 1)
+      res[tid] = equal(data[tid], static_cast<T>(tid + 1));
+    else
+      res[tid] = equal(data[tid], static_cast<T>(tid));
----------------
shiltian wrote:

Since we don't have a reliable way to get the warp size on the host, I figure it is best to verify the result directly on the device.

https://github.com/llvm/llvm-project/pull/94139