[llvm] [OpenMP] Remove use of '__AMDGCN_WAVEFRONT_SIZE' (PR #113156)
Joseph Huber via llvm-commits
llvm-commits at lists.llvm.org
Mon Oct 21 04:56:44 PDT 2024
https://github.com/jhuber6 created https://github.com/llvm/llvm-project/pull/113156
Summary:
This is going to be deprecated in
https://github.com/llvm/llvm-project/pull/112849. This patch ports it to
use the builtin instead. This isn't a compile constant, so it could
slightly negatively affect codegen. There really should be an IR pass to
turn it into a constant if the function has known attributes.
Using the builtin is correct when we just do it for knowing the size
like we do here. Obviously guarding w32/w64 code with this check would
be broken.
>From 80e8268d4fb34d82a18aa10daf4a11253b33467d Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Mon, 21 Oct 2024 06:54:18 -0500
Subject: [PATCH] [OpenMP] Remove use of '__AMDGCN_WAVEFRONT_SIZE'
Summary:
This is going to be deprecated in
https://github.com/llvm/llvm-project/pull/112849. This patch ports it to
use the builtin instead. This isn't a compile constant, so it could
slightly negatively affect codegen. There really should be an IR pass to
turn it into a constant if the function has known attributes.
Using the builtin is correct when we just do it for knowing the size
like we do here. Obviously guarding w32/w64 code with this check would
be broken.
---
offload/DeviceRTL/src/Mapping.cpp | 10 +++-----
.../test/offloading/ompx_bare_ballot_sync.c | 25 +++++++++++++------
.../offloading/ompx_bare_shfl_down_sync.cpp | 18 +++++++++----
3 files changed, 34 insertions(+), 19 deletions(-)
diff --git a/offload/DeviceRTL/src/Mapping.cpp b/offload/DeviceRTL/src/Mapping.cpp
index 3aefcff68e1956..881bd12f034051 100644
--- a/offload/DeviceRTL/src/Mapping.cpp
+++ b/offload/DeviceRTL/src/Mapping.cpp
@@ -25,7 +25,6 @@ namespace ompx {
namespace impl {
// Forward declarations defined to be defined for AMDGCN and NVPTX.
-const llvm::omp::GV &getGridValue();
LaneMaskTy activemask();
LaneMaskTy lanemaskLT();
LaneMaskTy lanemaskGT();
@@ -37,15 +36,14 @@ uint32_t getBlockIdInKernel(int32_t Dim);
uint32_t getNumberOfBlocksInKernel(int32_t Dim);
uint32_t getWarpIdInBlock();
uint32_t getNumberOfWarpsInBlock();
+uint32_t getWarpSize();
/// AMDGCN Implementation
///
///{
#pragma omp begin declare variant match(device = {arch(amdgcn)})
-const llvm::omp::GV &getGridValue() {
- return llvm::omp::getAMDGPUGridValues<__AMDGCN_WAVEFRONT_SIZE>();
-}
+uint32_t getWarpSize() { return __builtin_amdgcn_wavefrontsize(); }
uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
switch (Dim) {
@@ -152,7 +150,7 @@ uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
UNREACHABLE("Dim outside range!");
}
-const llvm::omp::GV &getGridValue() { return llvm::omp::NVPTXGridValues; }
+uint32_t getWarpSize() { return __nvvm_read_ptx_sreg_warpsize(); }
LaneMaskTy activemask() { return __nvvm_activemask(); }
@@ -219,8 +217,6 @@ uint32_t getNumberOfWarpsInBlock() {
#pragma omp end declare variant
///}
-uint32_t getWarpSize() { return getGridValue().GV_Warp_Size; }
-
} // namespace impl
} // namespace ompx
diff --git a/offload/test/offloading/ompx_bare_ballot_sync.c b/offload/test/offloading/ompx_bare_ballot_sync.c
index 101d1255f0d670..b810fb404b58f6 100644
--- a/offload/test/offloading/ompx_bare_ballot_sync.c
+++ b/offload/test/offloading/ompx_bare_ballot_sync.c
@@ -8,22 +8,33 @@
#include <stdio.h>
#include <stdlib.h>
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+unsigned get_warp_size() { return __builtin_amdgcn_wavefrontsize(); }
+#pragma omp end declare variant
+
+#pragma omp begin declare variant match(device = {arch(nvptx64)})
+unsigned get_warp_size() { return __nvvm_read_ptx_sreg_warpsize(); }
+#pragma omp end declare variant
+
+#pragma omp begin declare variant match(device = {kind(cpu)})
+unsigned get_warp_size() { return 1; }
+#pragma omp end declare variant
+
int main(int argc, char *argv[]) {
const int num_blocks = 1;
const int block_size = 256;
const int N = num_blocks * block_size;
int *res = (int *)malloc(N * sizeof(int));
-#pragma omp target teams ompx_bare num_teams(num_blocks) thread_limit(block_size) \
- map(from: res[0:N])
+#pragma omp target teams ompx_bare num_teams(num_blocks) \
+ thread_limit(block_size) map(from : res[0 : N])
{
int tid = ompx_thread_id_x();
uint64_t mask = ompx_ballot_sync(~0LU, tid & 0x1);
-#if defined __AMDGCN_WAVEFRONT_SIZE && __AMDGCN_WAVEFRONT_SIZE == 64
- res[tid] = mask == 0xaaaaaaaaaaaaaaaa;
-#else
- res[tid] = mask == 0xaaaaaaaa;
-#endif
+ if (get_warp_size() == 64)
+ res[tid] = mask == 0xaaaaaaaaaaaaaaaa;
+ else
+ res[tid] = mask == 0xaaaaaaaa;
}
for (int i = 0; i < N; ++i)
diff --git a/offload/test/offloading/ompx_bare_shfl_down_sync.cpp b/offload/test/offloading/ompx_bare_shfl_down_sync.cpp
index 9b0e66e25f68c9..311999918de857 100644
--- a/offload/test/offloading/ompx_bare_shfl_down_sync.cpp
+++ b/offload/test/offloading/ompx_bare_shfl_down_sync.cpp
@@ -10,6 +10,18 @@
#include <ompx.h>
#include <type_traits>
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+unsigned get_warp_size() { return __builtin_amdgcn_wavefrontsize(); }
+#pragma omp end declare variant
+
+#pragma omp begin declare variant match(device = {arch(nvptx64)})
+unsigned get_warp_size() { return __nvvm_read_ptx_sreg_warpsize(); }
+#pragma omp end declare variant
+
+#pragma omp begin declare variant match(device = {kind(cpu)})
+unsigned get_warp_size() { return 1; }
+#pragma omp end declare variant
+
template <typename T, std::enable_if_t<std::is_integral<T>::value, bool> = true>
bool equal(T LHS, T RHS) {
return LHS == RHS;
@@ -32,11 +44,7 @@ template <typename T> void test() {
{
int tid = ompx_thread_id_x();
T val = ompx::shfl_down_sync(~0U, static_cast<T>(tid), 1);
-#ifdef __AMDGCN_WAVEFRONT_SIZE
- int warp_size = __AMDGCN_WAVEFRONT_SIZE;
-#else
- int warp_size = 32;
-#endif
+ int warp_size = get_warp_size();
if ((tid & (warp_size - 1)) != warp_size - 1)
res[tid] = equal(val, static_cast<T>(tid + 1));
else
More information about the llvm-commits
mailing list