[llvm] [OpenMP] Remove use of '__AMDGCN_WAVEFRONT_SIZE' (PR #113156)

Mon Oct 21 04:56:44 PDT 2024

https://github.com/jhuber6 created https://github.com/llvm/llvm-project/pull/113156

Summary:
This is going to be deprecated in
https://github.com/llvm/llvm-project/pull/112849. This patch ports it to
use the builtin instead. This isn't a compile constant, so it could
slightly negatively affect codegen. There really should be an IR pass to
turn it into a constant if the function has known attributes.

Using the builtin is correct when we just do it for knowing the size
like we do here. Obviously guarding w32/w64 code with this check would
be broken.


>From 80e8268d4fb34d82a18aa10daf4a11253b33467d Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Mon, 21 Oct 2024 06:54:18 -0500
Subject: [PATCH] [OpenMP] Remove use of '__AMDGCN_WAVEFRONT_SIZE'

Summary:
This is going to be deprecated in
https://github.com/llvm/llvm-project/pull/112849. This patch ports it to
use the builtin instead. This isn't a compile constant, so it could
slightly negatively affect codegen. There really should be an IR pass to
turn it into a constant if the function has known attributes.

Using the builtin is correct when we just do it for knowing the size
like we do here. Obviously guarding w32/w64 code with this check would
be broken.
---
 offload/DeviceRTL/src/Mapping.cpp             | 10 +++-----
 .../test/offloading/ompx_bare_ballot_sync.c   | 25 +++++++++++++------
 .../offloading/ompx_bare_shfl_down_sync.cpp   | 18 +++++++++----
 3 files changed, 34 insertions(+), 19 deletions(-)

diff --git a/offload/DeviceRTL/src/Mapping.cpp b/offload/DeviceRTL/src/Mapping.cpp
index 3aefcff68e1956..881bd12f034051 100644
--- a/offload/DeviceRTL/src/Mapping.cpp
+++ b/offload/DeviceRTL/src/Mapping.cpp
@@ -25,7 +25,6 @@ namespace ompx {
 namespace impl {
 
 // Forward declarations defined to be defined for AMDGCN and NVPTX.
-const llvm::omp::GV &getGridValue();
 LaneMaskTy activemask();
 LaneMaskTy lanemaskLT();
 LaneMaskTy lanemaskGT();
@@ -37,15 +36,14 @@ uint32_t getBlockIdInKernel(int32_t Dim);
 uint32_t getNumberOfBlocksInKernel(int32_t Dim);
 uint32_t getWarpIdInBlock();
 uint32_t getNumberOfWarpsInBlock();
+uint32_t getWarpSize();
 
 /// AMDGCN Implementation
 ///
 ///{
 #pragma omp begin declare variant match(device = {arch(amdgcn)})
 
-const llvm::omp::GV &getGridValue() {
-  return llvm::omp::getAMDGPUGridValues<__AMDGCN_WAVEFRONT_SIZE>();
-}
+uint32_t getWarpSize() { return __builtin_amdgcn_wavefrontsize(); }
 
 uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
   switch (Dim) {
@@ -152,7 +150,7 @@ uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
   UNREACHABLE("Dim outside range!");
 }
 
-const llvm::omp::GV &getGridValue() { return llvm::omp::NVPTXGridValues; }
+uint32_t getWarpSize() { return __nvvm_read_ptx_sreg_warpsize(); }
 
 LaneMaskTy activemask() { return __nvvm_activemask(); }
 
@@ -219,8 +217,6 @@ uint32_t getNumberOfWarpsInBlock() {
 #pragma omp end declare variant
 ///}
 
-uint32_t getWarpSize() { return getGridValue().GV_Warp_Size; }
-
 } // namespace impl
 } // namespace ompx
 
diff --git a/offload/test/offloading/ompx_bare_ballot_sync.c b/offload/test/offloading/ompx_bare_ballot_sync.c
index 101d1255f0d670..b810fb404b58f6 100644
--- a/offload/test/offloading/ompx_bare_ballot_sync.c
+++ b/offload/test/offloading/ompx_bare_ballot_sync.c
@@ -8,22 +8,33 @@
 #include <stdio.h>
 #include <stdlib.h>
 
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+unsigned get_warp_size() { return __builtin_amdgcn_wavefrontsize(); }
+#pragma omp end declare variant
+
+#pragma omp begin declare variant match(device = {arch(nvptx64)})
+unsigned get_warp_size() { return __nvvm_read_ptx_sreg_warpsize(); }
+#pragma omp end declare variant
+
+#pragma omp begin declare variant match(device = {kind(cpu)})
+unsigned get_warp_size() { return 1; }
+#pragma omp end declare variant
+
 int main(int argc, char *argv[]) {
   const int num_blocks = 1;
   const int block_size = 256;
   const int N = num_blocks * block_size;
   int *res = (int *)malloc(N * sizeof(int));
 
-#pragma omp target teams ompx_bare num_teams(num_blocks) thread_limit(block_size) \
-        map(from: res[0:N])
+#pragma omp target teams ompx_bare num_teams(num_blocks)                       \
+    thread_limit(block_size) map(from : res[0 : N])
   {
     int tid = ompx_thread_id_x();
     uint64_t mask = ompx_ballot_sync(~0LU, tid & 0x1);
-#if defined __AMDGCN_WAVEFRONT_SIZE && __AMDGCN_WAVEFRONT_SIZE == 64
-    res[tid] = mask == 0xaaaaaaaaaaaaaaaa;
-#else
-    res[tid] = mask == 0xaaaaaaaa;
-#endif
+    if (get_warp_size() == 64)
+      res[tid] = mask == 0xaaaaaaaaaaaaaaaa;
+    else
+      res[tid] = mask == 0xaaaaaaaa;
   }
 
   for (int i = 0; i < N; ++i)
diff --git a/offload/test/offloading/ompx_bare_shfl_down_sync.cpp b/offload/test/offloading/ompx_bare_shfl_down_sync.cpp
index 9b0e66e25f68c9..311999918de857 100644
--- a/offload/test/offloading/ompx_bare_shfl_down_sync.cpp
+++ b/offload/test/offloading/ompx_bare_shfl_down_sync.cpp
@@ -10,6 +10,18 @@
 #include <ompx.h>
 #include <type_traits>
 
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+unsigned get_warp_size() { return __builtin_amdgcn_wavefrontsize(); }
+#pragma omp end declare variant
+
+#pragma omp begin declare variant match(device = {arch(nvptx64)})
+unsigned get_warp_size() { return __nvvm_read_ptx_sreg_warpsize(); }
+#pragma omp end declare variant
+
+#pragma omp begin declare variant match(device = {kind(cpu)})
+unsigned get_warp_size() { return 1; }
+#pragma omp end declare variant
+
 template <typename T, std::enable_if_t<std::is_integral<T>::value, bool> = true>
 bool equal(T LHS, T RHS) {
   return LHS == RHS;
@@ -32,11 +44,7 @@ template <typename T> void test() {
   {
     int tid = ompx_thread_id_x();
     T val = ompx::shfl_down_sync(~0U, static_cast<T>(tid), 1);
-#ifdef __AMDGCN_WAVEFRONT_SIZE
-    int warp_size = __AMDGCN_WAVEFRONT_SIZE;
-#else
-    int warp_size = 32;
-#endif
+    int warp_size = get_warp_size();
     if ((tid & (warp_size - 1)) != warp_size - 1)
       res[tid] = equal(val, static_cast<T>(tid + 1));
     else