[Openmp-commits] [openmp] 1f3a28d - [OpenMP][NFC] Reorganize the ompx::mapping layer in the GPU runtime
Johannes Doerfert via Openmp-commits
openmp-commits at lists.llvm.org
Mon Jul 31 13:45:30 PDT 2023
Author: Johannes Doerfert
Date: 2023-07-31T13:44:51-07:00
New Revision: 1f3a28d4e54649d1453eb951f570a8c1958d4a5c
URL: https://github.com/llvm/llvm-project/commit/1f3a28d4e54649d1453eb951f570a8c1958d4a5c
DIFF: https://github.com/llvm/llvm-project/commit/1f3a28d4e54649d1453eb951f570a8c1958d4a5c.diff
LOG: [OpenMP][NFC] Reorganize the ompx::mapping layer in the GPU runtime
This change makes the naming more consistent, I hope.
Added:
Modified:
openmp/libomptarget/DeviceRTL/include/Debug.h
openmp/libomptarget/DeviceRTL/include/Mapping.h
openmp/libomptarget/DeviceRTL/src/Debug.cpp
openmp/libomptarget/DeviceRTL/src/Kernel.cpp
openmp/libomptarget/DeviceRTL/src/Mapping.cpp
openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
openmp/libomptarget/DeviceRTL/src/Reduction.cpp
openmp/libomptarget/DeviceRTL/src/State.cpp
openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
Removed:
################################################################################
diff --git a/openmp/libomptarget/DeviceRTL/include/Debug.h b/openmp/libomptarget/DeviceRTL/include/Debug.h
index 29e7e5bb347d603..1ea129a97a17e89 100644
--- a/openmp/libomptarget/DeviceRTL/include/Debug.h
+++ b/openmp/libomptarget/DeviceRTL/include/Debug.h
@@ -31,6 +31,9 @@ void __assert_fail(const char *expr, const char *msg, const char *file,
else \
__assert_assume(expr); \
}
+#define UNREACHABLE(msg) \
+ PRINT(msg); \
+ __builtin_trap();
///}
diff --git a/openmp/libomptarget/DeviceRTL/include/Mapping.h b/openmp/libomptarget/DeviceRTL/include/Mapping.h
index 96314e4702f6de9..165904644dbb980 100644
--- a/openmp/libomptarget/DeviceRTL/include/Mapping.h
+++ b/openmp/libomptarget/DeviceRTL/include/Mapping.h
@@ -18,6 +18,12 @@ namespace ompx {
namespace mapping {
+enum {
+ DIM_X = 0,
+ DIM_Y = 1,
+ DIM_Z = 2,
+};
+
#pragma omp begin declare target device_type(nohost)
inline constexpr uint32_t MaxThreadsPerTeam = 1024;
@@ -63,34 +69,38 @@ LaneMaskTy lanemaskGT();
/// Return the thread Id in the warp, in [0, getWarpSize()).
uint32_t getThreadIdInWarp();
-/// Return the thread Id in the block, in [0, getBlockSize()).
-uint32_t getThreadIdInBlock();
-
-/// Return the warp id in the block.
-uint32_t getWarpId();
-
/// Return the warp size, thus number of threads in the warp.
uint32_t getWarpSize();
+/// Return the warp id in the block, in [0, getNumberOfWarpsInBlock()]
+uint32_t getWarpIdInBlock();
+
/// Return the number of warps in the block.
uint32_t getNumberOfWarpsInBlock();
-/// Return the block Id in the kernel, in [0, getKernelSize()).
-uint32_t getBlockId();
+/// Return the thread Id in the block, in [0, getNumberOfThreadsInBlock(Dim)).
+uint32_t getThreadIdInBlock(int32_t Dim = DIM_X);
/// Return the block size, thus number of threads in the block.
-///
-/// Note: The version taking \p IsSPMD mode explicitly can be used during the
-/// initialization of the target region, that is before `mapping::isSPMDMode()`
-/// can be called by any thread other than the main one.
-uint32_t getBlockSize();
-uint32_t getBlockSize(bool IsSPMD);
+uint32_t getNumberOfThreadsInBlock(int32_t Dim = DIM_X);
+
+/// Return the block Id in the kernel, in [0, getNumberOfBlocksInKernel(Dim)).
+uint32_t getBlockIdInKernel(int32_t Dim = DIM_X);
/// Return the number of blocks in the kernel.
-uint32_t getNumberOfBlocks();
+uint32_t getNumberOfBlocksInKernel(int32_t Dim = DIM_X);
/// Return the kernel size, thus number of threads in the kernel.
-uint32_t getKernelSize();
+uint32_t getNumberOfThreadsInKernel();
+
+/// Return the maximal number of threads in the block usable for a team (=
+/// parallel region).
+///
+/// Note: The version taking \p IsSPMD mode explicitly can be used during the
+/// initialization of the target region, that is before `mapping::isSPMDMode()`
+/// can be called by any thread other than the main one.
+uint32_t getMaxTeamThreads();
+uint32_t getMaxTeamThreads(bool IsSPMD);
/// Return the number of processing elements on the device.
uint32_t getNumberOfProcessorElements();
diff --git a/openmp/libomptarget/DeviceRTL/src/Debug.cpp b/openmp/libomptarget/DeviceRTL/src/Debug.cpp
index 873dccb929ea667..6e296b0a277caec 100644
--- a/openmp/libomptarget/DeviceRTL/src/Debug.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Debug.cpp
@@ -40,7 +40,8 @@ void __assert_fail(const char *expr, const char *msg, const char *file,
DebugEntryRAII::DebugEntryRAII(const char *File, const unsigned Line,
const char *Function) {
if (config::isDebugMode(config::DebugKind::FunctionTracing) &&
- mapping::getThreadIdInBlock() == 0 && mapping::getBlockId() == 0) {
+ mapping::getThreadIdInBlock() == 0 &&
+ mapping::getBlockIdInKernel() == 0) {
uint16_t &Level =
state::getKernelEnvironment().DynamicEnv->DebugIndentionLevel;
@@ -56,7 +57,8 @@ DebugEntryRAII::DebugEntryRAII(const char *File, const unsigned Line,
DebugEntryRAII::~DebugEntryRAII() {
if (config::isDebugMode(config::DebugKind::FunctionTracing) &&
- mapping::getThreadIdInBlock() == 0 && mapping::getBlockId() == 0) {
+ mapping::getThreadIdInBlock() == 0 &&
+ mapping::getBlockIdInKernel() == 0) {
uint16_t &Level =
state::getKernelEnvironment().DynamicEnv->DebugIndentionLevel;
Level--;
diff --git a/openmp/libomptarget/DeviceRTL/src/Kernel.cpp b/openmp/libomptarget/DeviceRTL/src/Kernel.cpp
index bc1d8d5b3d33f0e..e8f6cfdc6ea5405 100644
--- a/openmp/libomptarget/DeviceRTL/src/Kernel.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Kernel.cpp
@@ -107,10 +107,11 @@ int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment) {
// reaches its corresponding synchronize::threads call: that would permit all
// active worker threads to proceed before the main thread has actually set
// state::ParallelRegionFn, and then they would immediately quit without
- // doing any work. mapping::getBlockSize() does not include any of the main
- // thread's warp, so none of its threads can ever be active worker threads.
+ // doing any work. mapping::getMaxTeamThreads() does not include any of the
+ // main thread's warp, so none of its threads can ever be active worker
+ // threads.
if (UseGenericStateMachine &&
- mapping::getThreadIdInBlock() < mapping::getBlockSize(IsSPMD)) {
+ mapping::getThreadIdInBlock() < mapping::getMaxTeamThreads(IsSPMD)) {
genericStateMachine(KernelEnvironment.Ident);
} else {
// Retrieve the work function just to ensure we always call
diff --git a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
index 78361284ff8de2b..8f26af086e714de 100644
--- a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
@@ -26,16 +26,16 @@ namespace impl {
// Forward declarations defined to be defined for AMDGCN and NVPTX.
const llvm::omp::GV &getGridValue();
-uint32_t getNumHardwareThreadsInBlock();
LaneMaskTy activemask();
LaneMaskTy lanemaskLT();
LaneMaskTy lanemaskGT();
uint32_t getThreadIdInWarp();
-uint32_t getThreadIdInBlock();
-uint32_t getKernelSize();
-uint32_t getBlockId();
-uint32_t getNumberOfBlocks();
-uint32_t getWarpId();
+uint32_t getThreadIdInBlock(int32_t Dim);
+uint32_t getNumberOfThreadsInBlock(int32_t Dim);
+uint32_t getNumberOfThreadsInKernel();
+uint32_t getBlockIdInKernel(int32_t Dim);
+uint32_t getNumberOfBlocksInKernel(int32_t Dim);
+uint32_t getWarpIdInBlock();
uint32_t getNumberOfWarpsInBlock();
/// AMDGCN Implementation
@@ -47,8 +47,16 @@ const llvm::omp::GV &getGridValue() {
return llvm::omp::getAMDGPUGridValues<__AMDGCN_WAVEFRONT_SIZE>();
}
-uint32_t getNumHardwareThreadsInBlock() {
- return __builtin_amdgcn_workgroup_size_x();
+uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
+ switch (Dim) {
+ case 0:
+ return __builtin_amdgcn_workgroup_size_x();
+ case 1:
+ return __builtin_amdgcn_workgroup_size_y();
+ case 2:
+ return __builtin_amdgcn_workgroup_size_z();
+ };
+ UNREACHABLE("Dim outside range!");
}
LaneMaskTy activemask() { return __builtin_amdgcn_read_exec(); }
@@ -73,22 +81,53 @@ uint32_t getThreadIdInWarp() {
return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
}
-uint32_t getThreadIdInBlock() { return __builtin_amdgcn_workitem_id_x(); }
+uint32_t getThreadIdInBlock(int32_t Dim) {
+ switch (Dim) {
+ case 0:
+ return __builtin_amdgcn_workitem_id_x();
+ case 1:
+ return __builtin_amdgcn_workitem_id_y();
+ case 2:
+ return __builtin_amdgcn_workitem_id_z();
+ };
+ UNREACHABLE("Dim outside range!");
+}
-uint32_t getKernelSize() { return __builtin_amdgcn_grid_size_x(); }
+uint32_t getNumberOfThreadsInKernel() {
+ return __builtin_amdgcn_grid_size_x() * __builtin_amdgcn_grid_size_y() *
+ __builtin_amdgcn_grid_size_z();
+}
-uint32_t getBlockId() { return __builtin_amdgcn_workgroup_id_x(); }
+uint32_t getBlockIdInKernel(int32_t Dim) {
+ switch (Dim) {
+ case 0:
+ return __builtin_amdgcn_workgroup_id_x();
+ case 1:
+ return __builtin_amdgcn_workgroup_id_y();
+ case 2:
+ return __builtin_amdgcn_workgroup_id_z();
+ };
+ UNREACHABLE("Dim outside range!");
+}
-uint32_t getNumberOfBlocks() {
- return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
+uint32_t getNumberOfBlocksInKernel(int32_t Dim) {
+ switch (Dim) {
+ case 0:
+ return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
+ case 1:
+ return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
+ case 2:
+ return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
+ };
+ UNREACHABLE("Dim outside range!");
}
-uint32_t getWarpId() {
- return impl::getThreadIdInBlock() / mapping::getWarpSize();
+uint32_t getWarpIdInBlock() {
+ return impl::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize();
}
uint32_t getNumberOfWarpsInBlock() {
- return mapping::getBlockSize() / mapping::getWarpSize();
+ return mapping::getNumberOfThreadsInBlock() / mapping::getWarpSize();
}
#pragma omp end declare variant
@@ -101,8 +140,16 @@ uint32_t getNumberOfWarpsInBlock() {
device = {arch(nvptx, nvptx64)}, \
implementation = {extension(match_any)})
-uint32_t getNumHardwareThreadsInBlock() {
- return __nvvm_read_ptx_sreg_ntid_x();
+uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
+ switch (Dim) {
+ case 0:
+ return __nvvm_read_ptx_sreg_ntid_x();
+ case 1:
+ return __nvvm_read_ptx_sreg_ntid_y();
+ case 2:
+ return __nvvm_read_ptx_sreg_ntid_z();
+ };
+ UNREACHABLE("Dim outside range!");
}
const llvm::omp::GV &getGridValue() { return llvm::omp::NVPTXGridValues; }
@@ -125,27 +172,62 @@ LaneMaskTy lanemaskGT() {
return Res;
}
-uint32_t getThreadIdInBlock() { return __nvvm_read_ptx_sreg_tid_x(); }
+uint32_t getThreadIdInBlock(int32_t Dim) {
+ switch (Dim) {
+ case 0:
+ return __nvvm_read_ptx_sreg_tid_x();
+ case 1:
+ return __nvvm_read_ptx_sreg_tid_y();
+ case 2:
+ return __nvvm_read_ptx_sreg_tid_z();
+ };
+ UNREACHABLE("Dim outside range!");
+}
uint32_t getThreadIdInWarp() {
- return impl::getThreadIdInBlock() & (mapping::getWarpSize() - 1);
+ return impl::getThreadIdInBlock(mapping::DIM_X) &
+ (mapping::getWarpSize() - 1);
}
-uint32_t getKernelSize() {
- return __nvvm_read_ptx_sreg_nctaid_x() *
- mapping::getNumberOfProcessorElements();
+uint32_t getBlockIdInKernel(int32_t Dim) {
+ switch (Dim) {
+ case 0:
+ return __nvvm_read_ptx_sreg_ctaid_x();
+ case 1:
+ return __nvvm_read_ptx_sreg_ctaid_y();
+ case 2:
+ return __nvvm_read_ptx_sreg_ctaid_z();
+ };
+ UNREACHABLE("Dim outside range!");
}
-uint32_t getBlockId() { return __nvvm_read_ptx_sreg_ctaid_x(); }
+uint32_t getNumberOfBlocksInKernel(int32_t Dim) {
+ switch (Dim) {
+ case 0:
+ return __nvvm_read_ptx_sreg_nctaid_x();
+ case 1:
+ return __nvvm_read_ptx_sreg_nctaid_y();
+ case 2:
+ return __nvvm_read_ptx_sreg_nctaid_z();
+ };
+ UNREACHABLE("Dim outside range!");
+}
-uint32_t getNumberOfBlocks() { return __nvvm_read_ptx_sreg_nctaid_x(); }
+uint32_t getNumberOfThreadsInKernel() {
+ return impl::getNumberOfThreadsInBlock(0) *
+ impl::getNumberOfBlocksInKernel(0) *
+ impl::getNumberOfThreadsInBlock(1) *
+ impl::getNumberOfBlocksInKernel(1) *
+ impl::getNumberOfThreadsInBlock(2) *
+ impl::getNumberOfBlocksInKernel(2);
+}
-uint32_t getWarpId() {
- return impl::getThreadIdInBlock() / mapping::getWarpSize();
+uint32_t getWarpIdInBlock() {
+ return impl::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize();
}
uint32_t getNumberOfWarpsInBlock() {
- return (mapping::getBlockSize() + mapping::getWarpSize() - 1) /
+ return (mapping::getNumberOfThreadsInBlock() + mapping::getWarpSize() - 1) /
mapping::getWarpSize();
}
@@ -162,7 +244,7 @@ uint32_t getWarpSize() { return getGridValue().GV_Warp_Size; }
///{
static bool isInLastWarp() {
- uint32_t MainTId = (mapping::getNumberOfProcessorElements() - 1) &
+ uint32_t MainTId = (mapping::getNumberOfThreadsInBlock() - 1) &
~(mapping::getWarpSize() - 1);
return mapping::getThreadIdInBlock() == MainTId;
}
@@ -203,53 +285,55 @@ uint32_t mapping::getThreadIdInWarp() {
return ThreadIdInWarp;
}
-uint32_t mapping::getThreadIdInBlock() {
- uint32_t ThreadIdInBlock = impl::getThreadIdInBlock();
+uint32_t mapping::getThreadIdInBlock(int32_t Dim) {
+ uint32_t ThreadIdInBlock = impl::getThreadIdInBlock(Dim);
return ThreadIdInBlock;
}
uint32_t mapping::getWarpSize() { return impl::getWarpSize(); }
-uint32_t mapping::getBlockSize(bool IsSPMD) {
- uint32_t BlockSize =
- mapping::getNumberOfProcessorElements() - (!IsSPMD * impl::getWarpSize());
- return BlockSize;
+uint32_t mapping::getMaxTeamThreads(bool IsSPMD) {
+ uint32_t BlockSize = mapping::getNumberOfThreadsInBlock();
+ // If we are in SPMD mode, remove one warp.
+ return BlockSize - (!IsSPMD * impl::getWarpSize());
}
-uint32_t mapping::getBlockSize() {
- return mapping::getBlockSize(mapping::isSPMDMode());
+uint32_t mapping::getMaxTeamThreads() {
+ return mapping::getMaxTeamThreads(mapping::isSPMDMode());
}
-uint32_t mapping::getKernelSize() { return impl::getKernelSize(); }
+uint32_t mapping::getNumberOfThreadsInBlock(int32_t Dim) {
+ return impl::getNumberOfThreadsInBlock(Dim);
+}
+
+uint32_t mapping::getNumberOfThreadsInKernel() {
+ return impl::getNumberOfThreadsInKernel();
+}
-uint32_t mapping::getWarpId() {
- uint32_t WarpID = impl::getWarpId();
+uint32_t mapping::getWarpIdInBlock() {
+ uint32_t WarpID = impl::getWarpIdInBlock();
ASSERT(WarpID < impl::getNumberOfWarpsInBlock(), nullptr);
return WarpID;
}
-uint32_t mapping::getBlockId() {
- uint32_t BlockId = impl::getBlockId();
- ASSERT(BlockId < impl::getNumberOfBlocks(), nullptr);
+uint32_t mapping::getBlockIdInKernel(int32_t Dim) {
+ uint32_t BlockId = impl::getBlockIdInKernel(Dim);
+ ASSERT(BlockId < impl::getNumberOfBlocksInKernel(Dim), nullptr);
return BlockId;
}
uint32_t mapping::getNumberOfWarpsInBlock() {
uint32_t NumberOfWarpsInBlocks = impl::getNumberOfWarpsInBlock();
- ASSERT(impl::getWarpId() < NumberOfWarpsInBlocks, nullptr);
+ ASSERT(impl::getWarpIdInBlock() < NumberOfWarpsInBlocks, nullptr);
return NumberOfWarpsInBlocks;
}
-uint32_t mapping::getNumberOfBlocks() {
- uint32_t NumberOfBlocks = impl::getNumberOfBlocks();
- ASSERT(impl::getBlockId() < NumberOfBlocks, nullptr);
+uint32_t mapping::getNumberOfBlocksInKernel(int32_t Dim) {
+ uint32_t NumberOfBlocks = impl::getNumberOfBlocksInKernel(Dim);
+ ASSERT(impl::getBlockIdInKernel(Dim) < NumberOfBlocks, nullptr);
return NumberOfBlocks;
}
-uint32_t mapping::getNumberOfProcessorElements() {
- uint32_t NumberOfProcessorElements = impl::getNumHardwareThreadsInBlock();
- ASSERT(impl::getThreadIdInBlock() < NumberOfProcessorElements, nullptr);
- return NumberOfProcessorElements;
-}
+uint32_t mapping::getNumberOfProcessorElements() { __builtin_trap(); }
///}
@@ -279,7 +363,7 @@ __attribute__((noinline)) uint32_t __kmpc_get_hardware_thread_id_in_block() {
__attribute__((noinline)) uint32_t __kmpc_get_hardware_num_threads_in_block() {
FunctionTracingRAII();
- return impl::getNumHardwareThreadsInBlock();
+ return impl::getNumberOfThreadsInBlock(mapping::DIM_X);
}
__attribute__((noinline)) uint32_t __kmpc_get_warp_size() {
@@ -287,4 +371,5 @@ __attribute__((noinline)) uint32_t __kmpc_get_warp_size() {
return impl::getWarpSize();
}
}
+
#pragma omp end declare target
diff --git a/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
index 82b944a8bd0a214..84d8f88105f107f 100644
--- a/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
@@ -49,7 +49,7 @@ namespace {
uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
uint32_t NThreadsICV =
NumThreadsClause != -1 ? NumThreadsClause : icv::NThreads;
- uint32_t NumThreads = mapping::getBlockSize();
+ uint32_t NumThreads = mapping::getMaxTeamThreads();
if (NThreadsICV != 0 && NThreadsICV < NumThreads)
NumThreads = NThreadsICV;
@@ -110,8 +110,8 @@ void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
ASSERT(state::HasThreadState == false, nullptr);
uint32_t NumThreads = determineNumberOfThreads(num_threads);
- uint32_t BlockSize = mapping::getBlockSize();
- uint32_t PTeamSize = NumThreads == BlockSize ? 0 : NumThreads;
+ uint32_t MaxTeamThreads = mapping::getMaxTeamThreads();
+ uint32_t PTeamSize = NumThreads == MaxTeamThreads ? 0 : NumThreads;
if (mapping::isSPMDMode()) {
// Avoid the race between the read of the `icv::Level` above and the write
// below by synchronizing all threads here.
diff --git a/openmp/libomptarget/DeviceRTL/src/Reduction.cpp b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
index f544928e33c4098..8e0b91bfc1eb14b 100644
--- a/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
@@ -91,7 +91,7 @@ static int32_t nvptx_parallel_reduce_nowait(int32_t TId, int32_t num_vars,
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
uint32_t WarpsNeeded =
(NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
- uint32_t WarpId = mapping::getWarpId();
+ uint32_t WarpId = mapping::getWarpIdInBlock();
// Volta execution model:
// For the Generic execution mode a parallel region either has 1 thread and
diff --git a/openmp/libomptarget/DeviceRTL/src/State.cpp b/openmp/libomptarget/DeviceRTL/src/State.cpp
index f824ea2809c2177..c18368da72d6cd3 100644
--- a/openmp/libomptarget/DeviceRTL/src/State.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/State.cpp
@@ -93,7 +93,7 @@ struct SharedMemorySmartStackTy {
private:
/// Compute the size of the storage space reserved for a thread.
uint32_t computeThreadStorageTotal() {
- uint32_t NumLanesInBlock = mapping::getNumberOfProcessorElements();
+ uint32_t NumLanesInBlock = mapping::getNumberOfThreadsInBlock();
return utils::align_down((state::SharedScratchpadSize / NumLanesInBlock),
Alignment);
}
@@ -269,7 +269,7 @@ void state::enterDataEnvironment(IdentTy *Ident) {
static_cast<ThreadStateTy *>(__kmpc_alloc_shared(sizeof(ThreadStateTy)));
uintptr_t *ThreadStatesBitsPtr = reinterpret_cast<uintptr_t *>(&ThreadStates);
if (!atomic::load(ThreadStatesBitsPtr, atomic::seq_cst)) {
- uint32_t Bytes = sizeof(ThreadStates[0]) * mapping::getBlockSize();
+ uint32_t Bytes = sizeof(ThreadStates[0]) * mapping::getMaxTeamThreads();
void *ThreadStatesPtr =
memory::allocGlobal(Bytes, "Thread state array allocation");
if (!atomic::cas(ThreadStatesBitsPtr, uintptr_t(0),
@@ -322,7 +322,7 @@ void state::assumeInitialState(bool IsSPMD) {
int state::getEffectivePTeamSize() {
int PTeamSize = state::ParallelTeamSize;
- return PTeamSize ? PTeamSize : mapping::getBlockSize();
+ return PTeamSize ? PTeamSize : mapping::getMaxTeamThreads();
}
extern "C" {
@@ -334,7 +334,7 @@ void omp_set_num_threads(int V) { icv::NThreads = V; }
int omp_get_max_threads(void) {
int NT = icv::NThreads;
- return NT > 0 ? NT : mapping::getBlockSize();
+ return NT > 0 ? NT : mapping::getMaxTeamThreads();
}
int omp_get_level(void) {
@@ -373,7 +373,7 @@ int omp_get_num_threads(void) {
return omp_get_level() != 1 ? 1 : state::getEffectivePTeamSize();
}
-int omp_get_thread_limit(void) { return mapping::getBlockSize(); }
+int omp_get_thread_limit(void) { return mapping::getMaxTeamThreads(); }
int omp_get_num_procs(void) { return mapping::getNumberOfProcessorElements(); }
@@ -415,9 +415,9 @@ int omp_get_num_devices(void) { return config::getNumDevices(); }
int omp_get_device_num(void) { return config::getDeviceNum(); }
-int omp_get_num_teams(void) { return mapping::getNumberOfBlocks(); }
+int omp_get_num_teams(void) { return mapping::getNumberOfBlocksInKernel(); }
-int omp_get_team_num() { return mapping::getBlockId(); }
+int omp_get_team_num() { return mapping::getBlockIdInKernel(); }
int omp_get_initial_device(void) { return -1; }
}
diff --git a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
index 36536b7a81a164d..cb60aab25babfc8 100644
--- a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
@@ -385,7 +385,7 @@ void setLock(omp_lock_t *Lock) {
for (;;) {
now = __nvvm_read_ptx_sreg_clock();
int32_t cycles = now > start ? now - start : now + (0xffffffff - start);
- if (cycles >= OMP_SPIN * mapping::getBlockId()) {
+ if (cycles >= OMP_SPIN * mapping::getBlockIdInKernel()) {
break;
}
}
More information about the Openmp-commits
mailing list