[llvm] [OpenMP] Replace most GPU helpers with ones from <gpuintrin.h> (PR #125771)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Feb 4 14:18:28 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-offload
Author: Joseph Huber (jhuber6)
<details>
<summary>Changes</summary>
Summary:
This patch cleans up the runtime by using the definitions from
`<gpuintrin.h>` instead. This reduces complexity and makes it easier to
port. I have left a handful leftover, atomicInc, shuffle, and the sleep
calls. These are not easily replaced but I will work on it.
---
Patch is 38.17 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/125771.diff
23 Files Affected:
- (modified) offload/DeviceRTL/CMakeLists.txt (+7-11)
- (modified) offload/DeviceRTL/include/Allocator.h (-4)
- (modified) offload/DeviceRTL/include/DeviceTypes.h (+1-21)
- (modified) offload/DeviceRTL/include/DeviceUtils.h (-4)
- (modified) offload/DeviceRTL/include/Mapping.h (-4)
- (modified) offload/DeviceRTL/include/State.h (+2-8)
- (modified) offload/DeviceRTL/include/Synchronization.h (-4)
- (modified) offload/DeviceRTL/include/Workshare.h (-4)
- (modified) offload/DeviceRTL/src/Allocator.cpp (-4)
- (modified) offload/DeviceRTL/src/Configuration.cpp (+2-6)
- (modified) offload/DeviceRTL/src/Debug.cpp (-4)
- (modified) offload/DeviceRTL/src/DeviceUtils.cpp (+9-35)
- (modified) offload/DeviceRTL/src/Kernel.cpp (-4)
- (modified) offload/DeviceRTL/src/LibC.cpp (-4)
- (modified) offload/DeviceRTL/src/Mapping.cpp (+40-235)
- (modified) offload/DeviceRTL/src/Misc.cpp (+6-20)
- (modified) offload/DeviceRTL/src/Parallelism.cpp (-4)
- (modified) offload/DeviceRTL/src/Profiling.cpp (-4)
- (modified) offload/DeviceRTL/src/Reduction.cpp (+2-6)
- (modified) offload/DeviceRTL/src/State.cpp (+16-18)
- (modified) offload/DeviceRTL/src/Synchronization.cpp (+5-33)
- (modified) offload/DeviceRTL/src/Tasking.cpp (+1-5)
- (modified) offload/DeviceRTL/src/Workshare.cpp (+3-6)
``````````diff
diff --git a/offload/DeviceRTL/CMakeLists.txt b/offload/DeviceRTL/CMakeLists.txt
index 099634e211e7a72..8f2a1fd01fabcc8 100644
--- a/offload/DeviceRTL/CMakeLists.txt
+++ b/offload/DeviceRTL/CMakeLists.txt
@@ -95,11 +95,10 @@ set (LIBOMPTARGET_LLVM_INCLUDE_DIRS_DEVICERTL "${LIBOMPTARGET_LLVM_INCLUDE_DIRS}
list(TRANSFORM LIBOMPTARGET_LLVM_INCLUDE_DIRS_DEVICERTL PREPEND "-I")
# Set flags for LLVM Bitcode compilation.
-set(bc_flags -c -foffload-lto -std=c++17 -fvisibility=hidden
- ${clang_opt_flags} --offload-device-only
- -nocudalib -nogpulib -nogpuinc -nostdlibinc
- -fopenmp -fopenmp-cuda-mode
- -Wno-unknown-cuda-version -Wno-openmp-target
+set(bc_flags -c -flto -std=c++17 -fvisibility=hidden
+ ${clang_opt_flags} -nogpulib -nostdlibinc
+ -fno-rtti -fno-exceptions -fconvergent-functions
+ -Wno-unknown-cuda-version
-DOMPTARGET_DEVICE_RUNTIME
-I${include_directory}
-I${devicertl_base_directory}/../include
@@ -123,8 +122,7 @@ function(compileDeviceRTLLibrary target_name target_triple)
add_custom_command(OUTPUT ${outfile}
COMMAND ${CLANG_TOOL}
${bc_flags}
- -fopenmp-targets=${target_triple}
- -Xopenmp-target=${target_triple} -march=
+ --target=${target_triple}
${target_bc_flags}
-MD -MF ${depfile}
${infile} -o ${outfile}
@@ -242,10 +240,8 @@ function(compileDeviceRTLLibrary target_name target_triple)
set(ide_target_name omptarget-ide-${target_name})
add_library(${ide_target_name} STATIC EXCLUDE_FROM_ALL ${src_files})
target_compile_options(${ide_target_name} PRIVATE
- -fopenmp-targets=${target_triple} -Xopenmp-target=${target_triple} -march=
- -fopenmp -fopenmp-cuda-mode -mllvm -openmp-opt-disable
- -foffload-lto -fvisibility=hidden --offload-device-only
- -nocudalib -nogpulib -nogpuinc -nostdlibinc -Wno-unknown-cuda-version
+ -fvisibility=hidden --target=${target_triple}
+ -nogpulib -nostdlibinc -Wno-unknown-cuda-version
)
target_compile_definitions(${ide_target_name} PRIVATE SHARED_SCRATCHPAD_SIZE=512)
target_include_directories(${ide_target_name} PRIVATE
diff --git a/offload/DeviceRTL/include/Allocator.h b/offload/DeviceRTL/include/Allocator.h
index 475f6a21bb47ebf..79c69a2a96b4e98 100644
--- a/offload/DeviceRTL/include/Allocator.h
+++ b/offload/DeviceRTL/include/Allocator.h
@@ -17,8 +17,6 @@
// Forward declaration.
struct KernelEnvironmentTy;
-#pragma omp begin declare target device_type(nohost)
-
namespace ompx {
namespace allocator {
@@ -44,6 +42,4 @@ extern "C" {
[[gnu::weak]] void free(void *Ptr);
}
-#pragma omp end declare target
-
#endif
diff --git a/offload/DeviceRTL/include/DeviceTypes.h b/offload/DeviceRTL/include/DeviceTypes.h
index 1cd044f432e5692..395d72eafbf4054 100644
--- a/offload/DeviceRTL/include/DeviceTypes.h
+++ b/offload/DeviceRTL/include/DeviceTypes.h
@@ -12,6 +12,7 @@
#ifndef OMPTARGET_TYPES_H
#define OMPTARGET_TYPES_H
+#include <gpuintrin.h>
#include <stddef.h>
#include <stdint.h>
@@ -99,14 +100,7 @@ struct TaskDescriptorTy {
TaskFnTy TaskFn;
};
-#pragma omp begin declare variant match(device = {arch(amdgcn)})
using LaneMaskTy = uint64_t;
-#pragma omp end declare variant
-
-#pragma omp begin declare variant match( \
- device = {arch(amdgcn)}, implementation = {extension(match_none)})
-using LaneMaskTy = uint64_t;
-#pragma omp end declare variant
namespace lanes {
enum : LaneMaskTy { All = ~(LaneMaskTy)0 };
@@ -162,20 +156,6 @@ typedef enum omp_allocator_handle_t {
#define __PRAGMA(STR) _Pragma(#STR)
#define OMP_PRAGMA(STR) __PRAGMA(omp STR)
-#define SHARED(NAME) \
- NAME [[clang::loader_uninitialized]]; \
- OMP_PRAGMA(allocate(NAME) allocator(omp_pteam_mem_alloc))
-
-// TODO: clang should use address space 5 for omp_thread_mem_alloc, but right
-// now that's not the case.
-#define THREAD_LOCAL(NAME) \
- [[clang::address_space(5)]] NAME [[clang::loader_uninitialized]]
-
-// TODO: clang should use address space 4 for omp_const_mem_alloc, maybe it
-// does?
-#define CONSTANT(NAME) \
- [[clang::address_space(4)]] NAME [[clang::loader_uninitialized]]
-
///}
#endif
diff --git a/offload/DeviceRTL/include/DeviceUtils.h b/offload/DeviceRTL/include/DeviceUtils.h
index 2243673aef61c78..b92514ee9838a14 100644
--- a/offload/DeviceRTL/include/DeviceUtils.h
+++ b/offload/DeviceRTL/include/DeviceUtils.h
@@ -15,8 +15,6 @@
#include "DeviceTypes.h"
#include "Shared/Utils.h"
-#pragma omp begin declare target device_type(nohost)
-
namespace utils {
template <typename T> struct type_identity {
@@ -95,6 +93,4 @@ bool isThreadLocalMemPtr(void *Ptr);
} // namespace utils
-#pragma omp end declare target
-
#endif
diff --git a/offload/DeviceRTL/include/Mapping.h b/offload/DeviceRTL/include/Mapping.h
index 2217eb7616b3862..f892a025159d482 100644
--- a/offload/DeviceRTL/include/Mapping.h
+++ b/offload/DeviceRTL/include/Mapping.h
@@ -24,12 +24,8 @@ enum {
DIM_Z = 2,
};
-#pragma omp begin declare target device_type(nohost)
-
inline constexpr uint32_t MaxThreadsPerTeam = 1024;
-#pragma omp end declare target
-
/// Initialize the mapping machinery.
void init(bool IsSPMD);
diff --git a/offload/DeviceRTL/include/State.h b/offload/DeviceRTL/include/State.h
index f0500c1083d7f44..58b619ff1072aef 100644
--- a/offload/DeviceRTL/include/State.h
+++ b/offload/DeviceRTL/include/State.h
@@ -22,8 +22,6 @@
// Forward declaration.
struct KernelEnvironmentTy;
-#pragma omp begin declare target device_type(nohost)
-
namespace ompx {
namespace memory {
@@ -88,8 +86,7 @@ struct TeamStateTy {
ParallelRegionFnTy ParallelRegionFnVar;
};
-extern TeamStateTy TeamState;
-#pragma omp allocate(TeamState) allocator(omp_pteam_mem_alloc)
+extern TeamStateTy [[clang::address_space(3)]] TeamState;
struct ThreadStateTy {
@@ -115,8 +112,7 @@ struct ThreadStateTy {
}
};
-extern ThreadStateTy **ThreadStates;
-#pragma omp allocate(ThreadStates) allocator(omp_pteam_mem_alloc)
+extern ThreadStateTy **[[clang::address_space(3)]] ThreadStates;
/// Initialize the state machinery. Must be called by all threads.
void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
@@ -378,6 +374,4 @@ inline state::Value<uint32_t, state::VK_RunSched> RunSched;
} // namespace ompx
-#pragma omp end declare target
-
#endif
diff --git a/offload/DeviceRTL/include/Synchronization.h b/offload/DeviceRTL/include/Synchronization.h
index 5045d3c2c99a336..f9eb8d0d2319837 100644
--- a/offload/DeviceRTL/include/Synchronization.h
+++ b/offload/DeviceRTL/include/Synchronization.h
@@ -15,8 +15,6 @@
#include "DeviceTypes.h"
#include "DeviceUtils.h"
-#pragma omp begin declare target device_type(nohost)
-
namespace ompx {
namespace atomic {
@@ -220,6 +218,4 @@ void system(atomic::OrderingTy Ordering);
} // namespace ompx
-#pragma omp end declare target
-
#endif
diff --git a/offload/DeviceRTL/include/Workshare.h b/offload/DeviceRTL/include/Workshare.h
index fa9b3b2430b8c4a..554c3271c334c05 100644
--- a/offload/DeviceRTL/include/Workshare.h
+++ b/offload/DeviceRTL/include/Workshare.h
@@ -12,8 +12,6 @@
#ifndef OMPTARGET_WORKSHARE_H
#define OMPTARGET_WORKSHARE_H
-#pragma omp begin declare target device_type(nohost)
-
namespace ompx {
namespace workshare {
@@ -25,6 +23,4 @@ void init(bool IsSPMD);
} // namespace ompx
-#pragma omp end declare target
-
#endif
diff --git a/offload/DeviceRTL/src/Allocator.cpp b/offload/DeviceRTL/src/Allocator.cpp
index ac662c48d4f5fb4..aac2a6005158efa 100644
--- a/offload/DeviceRTL/src/Allocator.cpp
+++ b/offload/DeviceRTL/src/Allocator.cpp
@@ -19,8 +19,6 @@
using namespace ompx;
-#pragma omp begin declare target device_type(nohost)
-
[[gnu::used, gnu::retain, gnu::weak,
gnu::visibility(
"protected")]] DeviceMemoryPoolTy __omp_rtl_device_memory_pool;
@@ -77,5 +75,3 @@ void *allocator::alloc(uint64_t Size) { return BumpAllocator.alloc(Size); }
void allocator::free(void *Ptr) { BumpAllocator.free(Ptr); }
///}
-
-#pragma omp end declare target
diff --git a/offload/DeviceRTL/src/Configuration.cpp b/offload/DeviceRTL/src/Configuration.cpp
index 0b488b8034178d7..796e9ee254f3ac8 100644
--- a/offload/DeviceRTL/src/Configuration.cpp
+++ b/offload/DeviceRTL/src/Configuration.cpp
@@ -17,8 +17,6 @@
using namespace ompx;
-#pragma omp begin declare target device_type(nohost)
-
// Weak definitions will be overridden by CGOpenmpRuntimeGPU if enabled.
[[gnu::weak]] extern const uint32_t __omp_rtl_debug_kind = 0;
[[gnu::weak]] extern const uint32_t __omp_rtl_assume_no_thread_state = 0;
@@ -30,8 +28,8 @@ using namespace ompx;
// This variable should be visible to the plugin so we override the default
// hidden visibility.
[[gnu::used, gnu::retain, gnu::weak,
- gnu::visibility("protected")]] DeviceEnvironmentTy
- CONSTANT(__omp_rtl_device_environment);
+ gnu::visibility("protected")]] DeviceEnvironmentTy __gpu_constant
+ __omp_rtl_device_environment;
uint32_t config::getAssumeTeamsOversubscription() {
return __omp_rtl_assume_teams_oversubscription;
@@ -85,5 +83,3 @@ bool config::mayUseNestedParallelism() {
return false;
return state::getKernelEnvironment().Configuration.MayUseNestedParallelism;
}
-
-#pragma omp end declare target
diff --git a/offload/DeviceRTL/src/Debug.cpp b/offload/DeviceRTL/src/Debug.cpp
index 1d9c9628854222b..5b5482d766b1d0d 100644
--- a/offload/DeviceRTL/src/Debug.cpp
+++ b/offload/DeviceRTL/src/Debug.cpp
@@ -21,8 +21,6 @@
using namespace ompx;
-#pragma omp begin declare target device_type(nohost)
-
extern "C" {
void __assert_assume(bool condition) { __builtin_assume(condition); }
@@ -44,5 +42,3 @@ void __assert_fail_internal(const char *expr, const char *msg, const char *file,
__builtin_trap();
}
}
-
-#pragma omp end declare target
diff --git a/offload/DeviceRTL/src/DeviceUtils.cpp b/offload/DeviceRTL/src/DeviceUtils.cpp
index c204a7be73b1fc0..50022873a65b145 100644
--- a/offload/DeviceRTL/src/DeviceUtils.cpp
+++ b/offload/DeviceRTL/src/DeviceUtils.cpp
@@ -15,14 +15,12 @@
#include "Interface.h"
#include "Mapping.h"
-#pragma omp begin declare target device_type(nohost)
+#include <gpuintrin.h>
using namespace ompx;
namespace impl {
-bool isSharedMemPtr(const void *Ptr) { return false; }
-
void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) {
static_assert(sizeof(unsigned long) == 8, "");
*LowBits = static_cast<uint32_t>(Val & 0x00000000FFFFFFFFUL);
@@ -33,17 +31,12 @@ uint64_t Pack(uint32_t LowBits, uint32_t HighBits) {
return (((uint64_t)HighBits) << 32) | (uint64_t)LowBits;
}
-int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width);
-int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
- int32_t Width);
-
-uint64_t ballotSync(uint64_t Mask, int32_t Pred);
-
/// AMDGCN Implementation
///
///{
-#pragma omp begin declare variant match(device = {arch(amdgcn)})
+#ifdef __AMDGPU__
+// TODO: Move this to <gpuintrin.h>.
int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width) {
int Self = mapping::getThreadIdInWarp();
int Index = SrcLane + (Self & ~(Width - 1));
@@ -57,25 +50,15 @@ int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
Index = (int)(LaneDelta + (Self & (Width - 1))) >= Width ? Self : Index;
return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
}
-
-uint64_t ballotSync(uint64_t Mask, int32_t Pred) {
- return Mask & __builtin_amdgcn_ballot_w64(Pred);
-}
-
-bool isSharedMemPtr(const void *Ptr) {
- return __builtin_amdgcn_is_shared(
- (const __attribute__((address_space(0))) void *)Ptr);
-}
-#pragma omp end declare variant
+#endif
///}
/// NVPTX Implementation
///
///{
-#pragma omp begin declare variant match( \
- device = {arch(nvptx, nvptx64)}, \
- implementation = {extension(match_any)})
+#ifdef __NVPTX__
+// TODO: Move this to <gpuintrin.h>.
int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width) {
return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, Width - 1);
}
@@ -84,14 +67,7 @@ int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width) {
int32_t T = ((mapping::getWarpSize() - Width) << 8) | 0x1f;
return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, T);
}
-
-uint64_t ballotSync(uint64_t Mask, int32_t Pred) {
- return __nvvm_vote_ballot_sync(static_cast<uint32_t>(Mask), Pred);
-}
-
-bool isSharedMemPtr(const void *Ptr) { return __nvvm_isspacep_shared(Ptr); }
-
-#pragma omp end declare variant
+#endif
///}
} // namespace impl
@@ -123,10 +99,10 @@ int64_t utils::shuffleDown(uint64_t Mask, int64_t Var, uint32_t Delta,
}
uint64_t utils::ballotSync(uint64_t Mask, int32_t Pred) {
- return impl::ballotSync(Mask, Pred);
+ return __gpu_ballot(Mask, Pred);
}
-bool utils::isSharedMemPtr(void *Ptr) { return impl::isSharedMemPtr(Ptr); }
+bool utils::isSharedMemPtr(void *Ptr) { return __gpu_is_ptr_local(Ptr); }
extern "C" {
int32_t __kmpc_shuffle_int32(int32_t Val, int16_t Delta, int16_t SrcLane) {
@@ -137,5 +113,3 @@ int64_t __kmpc_shuffle_int64(int64_t Val, int16_t Delta, int16_t Width) {
return utils::shuffleDown(lanes::All, Val, Delta, Width);
}
}
-
-#pragma omp end declare target
diff --git a/offload/DeviceRTL/src/Kernel.cpp b/offload/DeviceRTL/src/Kernel.cpp
index 8bb275eae776c6a..9bb89573dc0cb85 100644
--- a/offload/DeviceRTL/src/Kernel.cpp
+++ b/offload/DeviceRTL/src/Kernel.cpp
@@ -25,8 +25,6 @@
using namespace ompx;
-#pragma omp begin declare target device_type(nohost)
-
static void
inititializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
@@ -155,5 +153,3 @@ void __kmpc_target_deinit() {
int8_t __kmpc_is_spmd_exec_mode() { return mapping::isSPMDMode(); }
}
-
-#pragma omp end declare target
diff --git a/offload/DeviceRTL/src/LibC.cpp b/offload/DeviceRTL/src/LibC.cpp
index e55008f46269fe8..83f9233d9480325 100644
--- a/offload/DeviceRTL/src/LibC.cpp
+++ b/offload/DeviceRTL/src/LibC.cpp
@@ -8,8 +8,6 @@
#include "LibC.h"
-#pragma omp begin declare target device_type(nohost)
-
#if defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC)
extern "C" int vprintf(const char *format, __builtin_va_list) { return -1; }
#else
@@ -48,5 +46,3 @@ namespace ompx {
return ::vprintf(Format, vlist);
}
} // namespace ompx
-
-#pragma omp end declare target
diff --git a/offload/DeviceRTL/src/Mapping.cpp b/offload/DeviceRTL/src/Mapping.cpp
index 8583a539824c82a..8929692114e61e5 100644
--- a/offload/DeviceRTL/src/Mapping.cpp
+++ b/offload/DeviceRTL/src/Mapping.cpp
@@ -15,213 +15,12 @@
#include "Interface.h"
#include "State.h"
-#pragma omp begin declare target device_type(nohost)
+#include <gpuintrin.h>
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
using namespace ompx;
-namespace ompx {
-namespace impl {
-
-// Forward declarations defined to be defined for AMDGCN and NVPTX.
-LaneMaskTy activemask();
-LaneMaskTy lanemaskLT();
-LaneMaskTy lanemaskGT();
-uint32_t getThreadIdInWarp();
-uint32_t getThreadIdInBlock(int32_t Dim);
-uint32_t getNumberOfThreadsInBlock(int32_t Dim);
-uint32_t getNumberOfThreadsInKernel();
-uint32_t getBlockIdInKernel(int32_t Dim);
-uint32_t getNumberOfBlocksInKernel(int32_t Dim);
-uint32_t getWarpIdInBlock();
-uint32_t getNumberOfWarpsInBlock();
-uint32_t getWarpSize();
-
-/// AMDGCN Implementation
-///
-///{
-#pragma omp begin declare variant match(device = {arch(amdgcn)})
-
-uint32_t getWarpSize() { return __builtin_amdgcn_wavefrontsize(); }
-
-uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
- switch (Dim) {
- case 0:
- return __builtin_amdgcn_workgroup_size_x();
- case 1:
- return __builtin_amdgcn_workgroup_size_y();
- case 2:
- return __builtin_amdgcn_workgroup_size_z();
- };
- UNREACHABLE("Dim outside range!");
-}
-
-LaneMaskTy activemask() { return __builtin_amdgcn_read_exec(); }
-
-LaneMaskTy lanemaskLT() {
- uint32_t Lane = mapping::getThreadIdInWarp();
- int64_t Ballot = mapping::activemask();
- uint64_t Mask = ((uint64_t)1 << Lane) - (uint64_t)1;
- return Mask & Ballot;
-}
-
-LaneMaskTy lanemaskGT() {
- uint32_t Lane = mapping::getThreadIdInWarp();
- if (Lane == (mapping::getWarpSize() - 1))
- return 0;
- int64_t Ballot = mapping::activemask();
- uint64_t Mask = (~((uint64_t)0)) << (Lane + 1);
- return Mask & Ballot;
-}
-
-uint32_t getThreadIdInWarp() {
- return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
-}
-
-uint32_t getThreadIdInBlock(int32_t Dim) {
- switch (Dim) {
- case 0:
- return __builtin_amdgcn_workitem_id_x();
- case 1:
- return __builtin_amdgcn_workitem_id_y();
- case 2:
- return __builtin_amdgcn_workitem_id_z();
- };
- UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getNumberOfThreadsInKernel() {
- return __builtin_amdgcn_grid_size_x() * __builtin_amdgcn_grid_size_y() *
- __builtin_amdgcn_grid_size_z();
-}
-
-uint32_t getBlockIdInKernel(int32_t Dim) {
- switch (Dim) {
- case 0:
- return __builtin_amdgcn_workgroup_id_x();
- case 1:
- return __builtin_amdgcn_workgroup_id_y();
- case 2:
- return __builtin_amdgcn_workgroup_id_z();
- };
- UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getNumberOfBlocksInKernel(int32_t Dim) {
- switch (Dim) {
- case 0:
- return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
- case 1:
- return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
- case 2:
- return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
- };
- UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getWarpIdInBlock() {
- return impl::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize();
-}
-
-uint32_t getNumberOfWarpsInBlock() {
- return mapping::getNumberOfThreadsInBlock() / mapping::getWarpSize();
-}
-
-#pragma omp end declare variant
-///}
-
-/// NVPTX Implementation
-///
-///{
-#pragma omp begin declare variant match( \
- device = {arch(nvptx, nvptx64)}, \
- implementation = {extension(match_any)})
-
-uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
- switch (Dim) {
- case 0:
- return __nvvm_read_ptx_sreg_ntid_x();
- case 1:
- return __nvvm_read_ptx_sreg_ntid_y();
- case 2:
- return __nvvm_read_ptx_sreg_ntid_z();
- };
- UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getWarpSize() { return __nvvm_read_ptx_sreg_warpsize(); }
-
-LaneMaskTy activemask() { return __nvvm_activemask(); }
-
-LaneMaskTy lanemaskLT() { return __nvvm_read_ptx_sreg_lanemask_lt(); }
-
-LaneMaskTy lanemaskGT() { return __nvvm_read_ptx_sreg_lanemask_gt(); }
-
-uint32_t getThreadIdInBlock(int32_t Dim) {
- switch (Dim) {
- case 0:
- return __nvvm_read_ptx_sreg_tid_x();
- case 1:
- return __nvvm_read_ptx_sreg_tid_y();
- case 2:
- return __nvvm_read_ptx_sreg_tid_z();
- };
- UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getThreadIdInWarp() { return __nvvm_read_ptx_sreg_laneid(); }
-
-uint32_t getBlockIdInKernel(int32_t Dim) {
- switch (Dim) {
- case 0:
- return __nvvm_read_ptx_sreg_ctaid_x();
- case 1:
- return __nvvm_read_ptx_sreg_ctaid_y();
- case 2:
- return __nvvm_read_ptx_sreg_ctaid_z();
- };
- UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getNumberOfBlocksInKernel(int32_t Dim) {
- switch (Dim) {
- case 0:
- return __nvvm_read_ptx_sreg_nctaid_x();
- case 1:
- return __nvvm_read_ptx_sreg_nctaid_y();
- case 2:
- return __nvvm_read_ptx_sreg_nctaid_z();
- };
- UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getNumberOfThreadsInKernel() {
- return impl::getNumberOfThreadsInBlock(0) *
- impl::getNumberOfBlocksInKernel(0) *
- impl::getNumberOfThreadsInBlock(1) *
- impl::getNumberOfBlocksInKernel(1) *
- impl::getNumberOfThreadsInBlock(2) *
- impl::getNumberOfBlocksInKernel(2);
-}
-
-uint32_t getWarpIdInBlock() {
- return impl::get...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/125771
More information about the llvm-commits
mailing list