[llvm] [openmp] Reapply "[OpenMP][OMPX] Add shfl_down_sync (#93311)" (PR #94139)

Mon Jun 3 08:15:46 PDT 2024

================
@@ -0,0 +1,66 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+//
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+
+#include <cassert>
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <limits>
+#include <ompx.h>
+#include <type_traits>
+
+template <typename T, std::enable_if_t<std::is_integral<T>::value, bool> = true>
+bool equal(T LHS, T RHS) { // Integral overload: exact comparison with ==.
+  return LHS == RHS;
+}
+
+template <typename T,
+          std::enable_if_t<std::is_floating_point<T>::value, bool> = true>
+bool equal(T LHS, T RHS) { // Floating-point overload: absolute epsilon bound.
+  return std::abs(LHS - RHS) < std::numeric_limits<T>::epsilon();
+}
+
+template <typename T> void test() {
+  constexpr const int num_blocks = 1;
+  constexpr const int block_size = 256;
+  constexpr const int N = num_blocks * block_size;
+  int *res = new int[N];
+
+#pragma omp target teams ompx_bare num_teams(num_blocks) thread_limit(block_size) \
+        map(from: res[0:N])
+  {
+    int tid = ompx_thread_id_x();
+    T val = ompx::shfl_down_sync(~0U, static_cast<T>(tid), 1);
+#ifdef __AMDGCN_WAVEFRONT_SIZE
+    int warp_size = __AMDGCN_WAVEFRONT_SIZE;
+#else
+    int warp_size = 32;
+#endif
----------------
shiltian wrote:

Well, the assumption is that we always define this macro (`__AMDGCN_WAVEFRONT_SIZE`) for AMDGPUs when compiling device code.

https://github.com/llvm/llvm-project/pull/94139