[llvm] [OpenMP] Replace utilities with 'gpuintrin.h' definitions (PR #131644)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 17 09:56:37 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-offload
Author: Joseph Huber (jhuber6)
<details>
<summary>Changes</summary>
- **[OpenMP] Use 'gpuintrin.h' definitions for simple block identifiers**
- **[OpenMP] Replace utilities with 'gpuintrin.h' definitions**
---
Full diff: https://github.com/llvm/llvm-project/pull/131644.diff
4 Files Affected:
- (modified) offload/DeviceRTL/include/Mapping.h (+3-3)
- (modified) offload/DeviceRTL/src/DeviceUtils.cpp (+15-84)
- (modified) offload/DeviceRTL/src/Mapping.cpp (+46-209)
- (modified) offload/DeviceRTL/src/Misc.cpp (+10-37)
``````````diff
diff --git a/offload/DeviceRTL/include/Mapping.h b/offload/DeviceRTL/include/Mapping.h
index f892a025159d4..8ba018b5314aa 100644
--- a/offload/DeviceRTL/include/Mapping.h
+++ b/offload/DeviceRTL/include/Mapping.h
@@ -19,9 +19,9 @@ namespace ompx {
namespace mapping {
enum {
- DIM_X = 0,
- DIM_Y = 1,
- DIM_Z = 2,
+ DIM_X = __GPU_X_DIM,
+ DIM_Y = __GPU_Y_DIM,
+ DIM_Z = __GPU_Z_DIM,
};
inline constexpr uint32_t MaxThreadsPerTeam = 1024;
diff --git a/offload/DeviceRTL/src/DeviceUtils.cpp b/offload/DeviceRTL/src/DeviceUtils.cpp
index d8109537832e9..d6f8c499c8904 100644
--- a/offload/DeviceRTL/src/DeviceUtils.cpp
+++ b/offload/DeviceRTL/src/DeviceUtils.cpp
@@ -14,117 +14,48 @@
#include "Debug.h"
#include "Interface.h"
#include "Mapping.h"
+#include "gpuintrin.h"
using namespace ompx;
-namespace impl {
-
-void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) {
- static_assert(sizeof(unsigned long) == 8, "");
- *LowBits = static_cast<uint32_t>(Val & 0x00000000FFFFFFFFUL);
- *HighBits = static_cast<uint32_t>((Val & 0xFFFFFFFF00000000UL) >> 32);
-}
-
-uint64_t Pack(uint32_t LowBits, uint32_t HighBits) {
- return (((uint64_t)HighBits) << 32) | (uint64_t)LowBits;
-}
-
-int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width);
-int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
- int32_t Width);
-
-uint64_t ballotSync(uint64_t Mask, int32_t Pred);
-
-/// AMDGCN Implementation
-///
-///{
-#ifdef __AMDGPU__
-
-int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width) {
- int Self = mapping::getThreadIdInWarp();
- int Index = SrcLane + (Self & ~(Width - 1));
- return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
-}
-
-int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
- int32_t Width) {
- int Self = mapping::getThreadIdInWarp();
- int Index = Self + LaneDelta;
- Index = (int)(LaneDelta + (Self & (Width - 1))) >= Width ? Self : Index;
- return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
-}
-
-uint64_t ballotSync(uint64_t Mask, int32_t Pred) {
- return Mask & __builtin_amdgcn_ballot_w64(Pred);
-}
-
-bool isSharedMemPtr(const void *Ptr) {
- return __builtin_amdgcn_is_shared(
- (const __attribute__((address_space(0))) void *)Ptr);
-}
-#endif
-///}
-
-/// NVPTX Implementation
-///
-///{
-#ifdef __NVPTX__
-
-int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width) {
- return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, Width - 1);
-}
-
-int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width) {
- int32_t T = ((mapping::getWarpSize() - Width) << 8) | 0x1f;
- return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, T);
-}
-
-uint64_t ballotSync(uint64_t Mask, int32_t Pred) {
- return __nvvm_vote_ballot_sync(static_cast<uint32_t>(Mask), Pred);
-}
-
-bool isSharedMemPtr(const void *Ptr) { return __nvvm_isspacep_shared(Ptr); }
-
-#endif
-///}
-} // namespace impl
-
uint64_t utils::pack(uint32_t LowBits, uint32_t HighBits) {
- return impl::Pack(LowBits, HighBits);
+ return (((uint64_t)HighBits) << 32) | (uint64_t)LowBits;
}
void utils::unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits) {
- impl::Unpack(Val, &LowBits, &HighBits);
+ static_assert(sizeof(unsigned long) == 8, "");
+ LowBits = static_cast<uint32_t>(Val & 0x00000000FFFFFFFFUL);
+ HighBits = static_cast<uint32_t>((Val & 0xFFFFFFFF00000000UL) >> 32);
}
int32_t utils::shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane,
int32_t Width) {
- return impl::shuffle(Mask, Var, SrcLane, Width);
+ return __gpu_shuffle_idx_u32(Mask, SrcLane, Var, Width);
}
int32_t utils::shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta,
int32_t Width) {
- return impl::shuffleDown(Mask, Var, Delta, Width);
+ int32_t Self = mapping::getThreadIdInWarp();
+ int32_t Index = (Delta + (Self & (Width - 1))) >= Width ? Self : Self + Delta;
+ return __gpu_shuffle_idx_u64(Mask, Index, Var, Width);
}
int64_t utils::shuffleDown(uint64_t Mask, int64_t Var, uint32_t Delta,
int32_t Width) {
- uint32_t Lo, Hi;
- utils::unpack(Var, Lo, Hi);
- Hi = impl::shuffleDown(Mask, Hi, Delta, Width);
- Lo = impl::shuffleDown(Mask, Lo, Delta, Width);
- return utils::pack(Lo, Hi);
+ int32_t Self = mapping::getThreadIdInWarp();
+ int32_t Index = (Delta + (Self & (Width - 1))) >= Width ? Self : Self + Delta;
+ return __gpu_shuffle_idx_u64(Mask, Index, Var, Width);
}
uint64_t utils::ballotSync(uint64_t Mask, int32_t Pred) {
- return impl::ballotSync(Mask, Pred);
+ return __gpu_ballot(Mask, Pred);
}
-bool utils::isSharedMemPtr(void *Ptr) { return impl::isSharedMemPtr(Ptr); }
+bool utils::isSharedMemPtr(void *Ptr) { return __gpu_is_ptr_local(Ptr); }
extern "C" {
int32_t __kmpc_shuffle_int32(int32_t Val, int16_t Delta, int16_t SrcLane) {
- return impl::shuffleDown(lanes::All, Val, Delta, SrcLane);
+ return utils::shuffleDown(lanes::All, Val, Delta, SrcLane);
}
int64_t __kmpc_shuffle_int64(int64_t Val, int16_t Delta, int16_t Width) {
diff --git a/offload/DeviceRTL/src/Mapping.cpp b/offload/DeviceRTL/src/Mapping.cpp
index 641be81cca3ed..a9e027727b04b 100644
--- a/offload/DeviceRTL/src/Mapping.cpp
+++ b/offload/DeviceRTL/src/Mapping.cpp
@@ -14,198 +14,12 @@
#include "DeviceUtils.h"
#include "Interface.h"
#include "State.h"
+#include "gpuintrin.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
using namespace ompx;
-namespace ompx {
-namespace impl {
-
-/// AMDGCN Implementation
-///
-///{
-#ifdef __AMDGPU__
-
-uint32_t getWarpSize() { return __builtin_amdgcn_wavefrontsize(); }
-
-uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
- switch (Dim) {
- case 0:
- return __builtin_amdgcn_workgroup_size_x();
- case 1:
- return __builtin_amdgcn_workgroup_size_y();
- case 2:
- return __builtin_amdgcn_workgroup_size_z();
- };
- UNREACHABLE("Dim outside range!");
-}
-
-LaneMaskTy activemask() { return __builtin_amdgcn_read_exec(); }
-
-LaneMaskTy lanemaskLT() {
- uint32_t Lane = mapping::getThreadIdInWarp();
- int64_t Ballot = mapping::activemask();
- uint64_t Mask = ((uint64_t)1 << Lane) - (uint64_t)1;
- return Mask & Ballot;
-}
-
-LaneMaskTy lanemaskGT() {
- uint32_t Lane = mapping::getThreadIdInWarp();
- if (Lane == (mapping::getWarpSize() - 1))
- return 0;
- int64_t Ballot = mapping::activemask();
- uint64_t Mask = (~((uint64_t)0)) << (Lane + 1);
- return Mask & Ballot;
-}
-
-uint32_t getThreadIdInWarp() {
- return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
-}
-
-uint32_t getThreadIdInBlock(int32_t Dim) {
- switch (Dim) {
- case 0:
- return __builtin_amdgcn_workitem_id_x();
- case 1:
- return __builtin_amdgcn_workitem_id_y();
- case 2:
- return __builtin_amdgcn_workitem_id_z();
- };
- UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getNumberOfThreadsInKernel() {
- return __builtin_amdgcn_grid_size_x() * __builtin_amdgcn_grid_size_y() *
- __builtin_amdgcn_grid_size_z();
-}
-
-uint32_t getBlockIdInKernel(int32_t Dim) {
- switch (Dim) {
- case 0:
- return __builtin_amdgcn_workgroup_id_x();
- case 1:
- return __builtin_amdgcn_workgroup_id_y();
- case 2:
- return __builtin_amdgcn_workgroup_id_z();
- };
- UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getNumberOfBlocksInKernel(int32_t Dim) {
- switch (Dim) {
- case 0:
- return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
- case 1:
- return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
- case 2:
- return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
- };
- UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getWarpIdInBlock() {
- return impl::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize();
-}
-
-uint32_t getNumberOfWarpsInBlock() {
- return mapping::getNumberOfThreadsInBlock() / mapping::getWarpSize();
-}
-
-#endif
-///}
-
-/// NVPTX Implementation
-///
-///{
-#ifdef __NVPTX__
-
-uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
- switch (Dim) {
- case 0:
- return __nvvm_read_ptx_sreg_ntid_x();
- case 1:
- return __nvvm_read_ptx_sreg_ntid_y();
- case 2:
- return __nvvm_read_ptx_sreg_ntid_z();
- };
- UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getWarpSize() { return __nvvm_read_ptx_sreg_warpsize(); }
-
-LaneMaskTy activemask() { return __nvvm_activemask(); }
-
-LaneMaskTy lanemaskLT() { return __nvvm_read_ptx_sreg_lanemask_lt(); }
-
-LaneMaskTy lanemaskGT() { return __nvvm_read_ptx_sreg_lanemask_gt(); }
-
-uint32_t getThreadIdInBlock(int32_t Dim) {
- switch (Dim) {
- case 0:
- return __nvvm_read_ptx_sreg_tid_x();
- case 1:
- return __nvvm_read_ptx_sreg_tid_y();
- case 2:
- return __nvvm_read_ptx_sreg_tid_z();
- };
- UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getThreadIdInWarp() { return __nvvm_read_ptx_sreg_laneid(); }
-
-uint32_t getBlockIdInKernel(int32_t Dim) {
- switch (Dim) {
- case 0:
- return __nvvm_read_ptx_sreg_ctaid_x();
- case 1:
- return __nvvm_read_ptx_sreg_ctaid_y();
- case 2:
- return __nvvm_read_ptx_sreg_ctaid_z();
- };
- UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getNumberOfBlocksInKernel(int32_t Dim) {
- switch (Dim) {
- case 0:
- return __nvvm_read_ptx_sreg_nctaid_x();
- case 1:
- return __nvvm_read_ptx_sreg_nctaid_y();
- case 2:
- return __nvvm_read_ptx_sreg_nctaid_z();
- };
- UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getNumberOfThreadsInKernel() {
- return impl::getNumberOfThreadsInBlock(0) *
- impl::getNumberOfBlocksInKernel(0) *
- impl::getNumberOfThreadsInBlock(1) *
- impl::getNumberOfBlocksInKernel(1) *
- impl::getNumberOfThreadsInBlock(2) *
- impl::getNumberOfBlocksInKernel(2);
-}
-
-uint32_t getWarpIdInBlock() {
- return impl::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize();
-}
-
-uint32_t getNumberOfWarpsInBlock() {
- return (mapping::getNumberOfThreadsInBlock() + mapping::getWarpSize() - 1) /
- mapping::getWarpSize();
-}
-
-#endif
-///}
-
-} // namespace impl
-} // namespace ompx
-
-/// We have to be deliberate about the distinction of `mapping::` and `impl::`
-/// below to avoid repeating assumptions or including irrelevant ones.
-///{
-
static bool isInLastWarp() {
uint32_t MainTId = (mapping::getNumberOfThreadsInBlock() - 1) &
~(mapping::getWarpSize() - 1);
@@ -236,64 +50,87 @@ bool mapping::isLeaderInWarp() {
return utils::popc(Active & LaneMaskLT) == 0;
}
-LaneMaskTy mapping::activemask() { return impl::activemask(); }
+LaneMaskTy mapping::activemask() { return __gpu_lane_mask(); }
-LaneMaskTy mapping::lanemaskLT() { return impl::lanemaskLT(); }
+LaneMaskTy mapping::lanemaskLT() {
+#ifdef __NVPTX__
+ return __nvvm_read_ptx_sreg_lanemask_lt();
+#else
+ uint32_t Lane = mapping::getThreadIdInWarp();
+ int64_t Ballot = mapping::activemask();
+ uint64_t Mask = ((uint64_t)1 << Lane) - (uint64_t)1;
+ return Mask & Ballot;
+#endif
+}
-LaneMaskTy mapping::lanemaskGT() { return impl::lanemaskGT(); }
+LaneMaskTy mapping::lanemaskGT() {
+#ifdef __NVPTX__
+ return __nvvm_read_ptx_sreg_lanemask_gt();
+#else
+ uint32_t Lane = mapping::getThreadIdInWarp();
+ if (Lane == (mapping::getWarpSize() - 1))
+ return 0;
+ int64_t Ballot = mapping::activemask();
+ uint64_t Mask = (~((uint64_t)0)) << (Lane + 1);
+ return Mask & Ballot;
+#endif
+}
uint32_t mapping::getThreadIdInWarp() {
- uint32_t ThreadIdInWarp = impl::getThreadIdInWarp();
- ASSERT(ThreadIdInWarp < impl::getWarpSize(), nullptr);
+ uint32_t ThreadIdInWarp = __gpu_lane_id();
+ ASSERT(ThreadIdInWarp < mapping::getWarpSize(), nullptr);
return ThreadIdInWarp;
}
uint32_t mapping::getThreadIdInBlock(int32_t Dim) {
- uint32_t ThreadIdInBlock = impl::getThreadIdInBlock(Dim);
+ uint32_t ThreadIdInBlock = __gpu_thread_id(Dim);
return ThreadIdInBlock;
}
-uint32_t mapping::getWarpSize() { return impl::getWarpSize(); }
+uint32_t mapping::getWarpSize() { return __gpu_num_lanes(); }
uint32_t mapping::getMaxTeamThreads(bool IsSPMD) {
uint32_t BlockSize = mapping::getNumberOfThreadsInBlock();
// If we are in SPMD mode, remove one warp.
- return BlockSize - (!IsSPMD * impl::getWarpSize());
+ return BlockSize - (!IsSPMD * mapping::getWarpSize());
}
uint32_t mapping::getMaxTeamThreads() {
return mapping::getMaxTeamThreads(mapping::isSPMDMode());
}
uint32_t mapping::getNumberOfThreadsInBlock(int32_t Dim) {
- return impl::getNumberOfThreadsInBlock(Dim);
+ return __gpu_num_threads(Dim);
}
uint32_t mapping::getNumberOfThreadsInKernel() {
- return impl::getNumberOfThreadsInKernel();
+ return mapping::getNumberOfThreadsInBlock(0) *
+ mapping::getNumberOfBlocksInKernel(0) *
+ mapping::getNumberOfThreadsInBlock(1) *
+ mapping::getNumberOfBlocksInKernel(1) *
+ mapping::getNumberOfThreadsInBlock(2) *
+ mapping::getNumberOfBlocksInKernel(2);
}
uint32_t mapping::getWarpIdInBlock() {
- uint32_t WarpID = impl::getWarpIdInBlock();
- ASSERT(WarpID < impl::getNumberOfWarpsInBlock(), nullptr);
+ uint32_t WarpID =
+ mapping::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize();
+ ASSERT(WarpID < mapping::getNumberOfWarpsInBlock(), nullptr);
return WarpID;
}
uint32_t mapping::getBlockIdInKernel(int32_t Dim) {
- uint32_t BlockId = impl::getBlockIdInKernel(Dim);
- ASSERT(BlockId < impl::getNumberOfBlocksInKernel(Dim), nullptr);
+ uint32_t BlockId = __gpu_block_id(Dim);
+ ASSERT(BlockId < mapping::getNumberOfBlocksInKernel(Dim), nullptr);
return BlockId;
}
uint32_t mapping::getNumberOfWarpsInBlock() {
- uint32_t NumberOfWarpsInBlocks = impl::getNumberOfWarpsInBlock();
- ASSERT(impl::getWarpIdInBlock() < NumberOfWarpsInBlocks, nullptr);
- return NumberOfWarpsInBlocks;
+ return (mapping::getNumberOfThreadsInBlock() + mapping::getWarpSize() - 1) /
+ mapping::getWarpSize();
}
uint32_t mapping::getNumberOfBlocksInKernel(int32_t Dim) {
- uint32_t NumberOfBlocks = impl::getNumberOfBlocksInKernel(Dim);
- ASSERT(impl::getBlockIdInKernel(Dim) < NumberOfBlocks, nullptr);
- return NumberOfBlocks;
+ return __gpu_num_blocks(Dim);
}
uint32_t mapping::getNumberOfProcessorElements() {
@@ -326,11 +163,11 @@ extern "C" {
}
[[gnu::noinline]] uint32_t __kmpc_get_hardware_num_threads_in_block() {
- return impl::getNumberOfThreadsInBlock(mapping::DIM_X);
+ return mapping::getNumberOfThreadsInBlock(mapping::DIM_X);
}
[[gnu::noinline]] uint32_t __kmpc_get_warp_size() {
- return impl::getWarpSize();
+ return mapping::getWarpSize();
}
}
diff --git a/offload/DeviceRTL/src/Misc.cpp b/offload/DeviceRTL/src/Misc.cpp
index 734e937f03920..a89f8b2a74531 100644
--- a/offload/DeviceRTL/src/Misc.cpp
+++ b/offload/DeviceRTL/src/Misc.cpp
@@ -20,41 +20,6 @@
namespace ompx {
namespace impl {
-/// AMDGCN Implementation
-///
-///{
-#ifdef __AMDGPU__
-
-double getWTick() {
- // The number of ticks per second for the AMDGPU clock varies by card and can
- // only be retrieved by querying the driver. We rely on the device environment
- // to inform us what the proper frequency is.
- return 1.0 / config::getClockFrequency();
-}
-
-double getWTime() {
- return static_cast<double>(__builtin_readsteadycounter()) * getWTick();
-}
-
-#endif
-
-/// NVPTX Implementation
-///
-///{
-#ifdef __NVPTX__
-
-double getWTick() {
- // Timer precision is 1ns
- return ((double)1E-9);
-}
-
-double getWTime() {
- uint64_t nsecs = __nvvm_read_ptx_sreg_globaltimer();
- return static_cast<double>(nsecs) * getWTick();
-}
-
-#endif
-
/// Lookup a device-side function using a host pointer /p HstPtr using the table
/// provided by the device plugin. The table is an ordered pair of host and
/// device pointers sorted on the value of the host pointer.
@@ -112,9 +77,17 @@ int32_t __kmpc_cancellationpoint(IdentTy *, int32_t, int32_t) { return 0; }
int32_t __kmpc_cancel(IdentTy *, int32_t, int32_t) { return 0; }
-double omp_get_wtick(void) { return ompx::impl::getWTick(); }
+double omp_get_wtick(void) {
+ // The number of ticks per second for the AMDGPU clock varies by card and can
+ // only be retrieved by querying the driver. We rely on the device environment
+ // to inform us what the proper frequency is. NVPTX uses a nanosecond
+ // resolution, we could omit the global read but this makes it consistent.
+ return 1.0 / ompx::config::getClockFrequency();
+}
-double omp_get_wtime(void) { return ompx::impl::getWTime(); }
+double omp_get_wtime(void) {
+ return static_cast<double>(__builtin_readsteadycounter()) * omp_get_wtick();
+}
void *__llvm_omp_indirect_call_lookup(void *HstPtr) {
return ompx::impl::indirectCallLookup(HstPtr);
``````````
</details>
https://github.com/llvm/llvm-project/pull/131644
More information about the llvm-commits
mailing list