[Openmp-commits] [openmp] 1f3a28d - [OpenMP][NFC] Reorganize the ompx::mapping layer in the GPU runtime

Mon Jul 31 13:45:30 PDT 2023

Author: Johannes Doerfert
Date: 2023-07-31T13:44:51-07:00
New Revision: 1f3a28d4e54649d1453eb951f570a8c1958d4a5c

URL: https://github.com/llvm/llvm-project/commit/1f3a28d4e54649d1453eb951f570a8c1958d4a5c
DIFF: https://github.com/llvm/llvm-project/commit/1f3a28d4e54649d1453eb951f570a8c1958d4a5c.diff

LOG: [OpenMP][NFC] Reorganize the ompx::mapping layer in the GPU runtime

This change makes the naming more consistent, I hope.

Added: 
    

Modified: 
    openmp/libomptarget/DeviceRTL/include/Debug.h
    openmp/libomptarget/DeviceRTL/include/Mapping.h
    openmp/libomptarget/DeviceRTL/src/Debug.cpp
    openmp/libomptarget/DeviceRTL/src/Kernel.cpp
    openmp/libomptarget/DeviceRTL/src/Mapping.cpp
    openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
    openmp/libomptarget/DeviceRTL/src/Reduction.cpp
    openmp/libomptarget/DeviceRTL/src/State.cpp
    openmp/libomptarget/DeviceRTL/src/Synchronization.cpp

Removed: 
    


################################################################################
diff  --git a/openmp/libomptarget/DeviceRTL/include/Debug.h b/openmp/libomptarget/DeviceRTL/include/Debug.h
index 29e7e5bb347d603..1ea129a97a17e89 100644

--- a/openmp/libomptarget/DeviceRTL/include/Debug.h
+++ b/openmp/libomptarget/DeviceRTL/include/Debug.h
@@ -31,6 +31,9 @@ void __assert_fail(const char *expr, const char *msg, const char *file,
     else                                                                       \
       __assert_assume(expr);                                                   \
   }
+#define UNREACHABLE(msg)                                                       \
+  PRINT(msg);                                                                  \
+  __builtin_trap();
 
 ///}
 

diff  --git a/openmp/libomptarget/DeviceRTL/include/Mapping.h b/openmp/libomptarget/DeviceRTL/include/Mapping.h
index 96314e4702f6de9..165904644dbb980 100644
--- a/openmp/libomptarget/DeviceRTL/include/Mapping.h
+++ b/openmp/libomptarget/DeviceRTL/include/Mapping.h
@@ -18,6 +18,12 @@ namespace ompx {
 
 namespace mapping {
 
+enum {
+  DIM_X = 0,
+  DIM_Y = 1,
+  DIM_Z = 2,
+};
+
 #pragma omp begin declare target device_type(nohost)
 
 inline constexpr uint32_t MaxThreadsPerTeam = 1024;
@@ -63,34 +69,38 @@ LaneMaskTy lanemaskGT();
 /// Return the thread Id in the warp, in [0, getWarpSize()).
 uint32_t getThreadIdInWarp();
 
-/// Return the thread Id in the block, in [0, getBlockSize()).
-uint32_t getThreadIdInBlock();
-
-/// Return the warp id in the block.
-uint32_t getWarpId();
-
 /// Return the warp size, thus number of threads in the warp.
 uint32_t getWarpSize();
 
+/// Return the warp id in the block, in [0, getNumberOfWarpsInBlock()]
+uint32_t getWarpIdInBlock();
+
 /// Return the number of warps in the block.
 uint32_t getNumberOfWarpsInBlock();
 
-/// Return the block Id in the kernel, in [0, getKernelSize()).
-uint32_t getBlockId();
+/// Return the thread Id in the block, in [0, getNumberOfThreadsInBlock(Dim)).
+uint32_t getThreadIdInBlock(int32_t Dim = DIM_X);
 
 /// Return the block size, thus number of threads in the block.
-///
-/// Note: The version taking \p IsSPMD mode explicitly can be used during the
-/// initialization of the target region, that is before `mapping::isSPMDMode()`
-/// can be called by any thread other than the main one.
-uint32_t getBlockSize();
-uint32_t getBlockSize(bool IsSPMD);
+uint32_t getNumberOfThreadsInBlock(int32_t Dim = DIM_X);
+
+/// Return the block Id in the kernel, in [0, getNumberOfBlocksInKernel(Dim)).
+uint32_t getBlockIdInKernel(int32_t Dim = DIM_X);
 
 /// Return the number of blocks in the kernel.
-uint32_t getNumberOfBlocks();
+uint32_t getNumberOfBlocksInKernel(int32_t Dim = DIM_X);
 
 /// Return the kernel size, thus number of threads in the kernel.
-uint32_t getKernelSize();
+uint32_t getNumberOfThreadsInKernel();
+
+/// Return the maximal number of threads in the block usable for a team (=
+/// parallel region).
+///
+/// Note: The version taking \p IsSPMD mode explicitly can be used during the
+/// initialization of the target region, that is before `mapping::isSPMDMode()`
+/// can be called by any thread other than the main one.
+uint32_t getMaxTeamThreads();
+uint32_t getMaxTeamThreads(bool IsSPMD);
 
 /// Return the number of processing elements on the device.
 uint32_t getNumberOfProcessorElements();

diff  --git a/openmp/libomptarget/DeviceRTL/src/Debug.cpp b/openmp/libomptarget/DeviceRTL/src/Debug.cpp
index 873dccb929ea667..6e296b0a277caec 100644
--- a/openmp/libomptarget/DeviceRTL/src/Debug.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Debug.cpp
@@ -40,7 +40,8 @@ void __assert_fail(const char *expr, const char *msg, const char *file,
 DebugEntryRAII::DebugEntryRAII(const char *File, const unsigned Line,
                                const char *Function) {
   if (config::isDebugMode(config::DebugKind::FunctionTracing) &&
-      mapping::getThreadIdInBlock() == 0 && mapping::getBlockId() == 0) {
+      mapping::getThreadIdInBlock() == 0 &&
+      mapping::getBlockIdInKernel() == 0) {
 
     uint16_t &Level =
         state::getKernelEnvironment().DynamicEnv->DebugIndentionLevel;
@@ -56,7 +57,8 @@ DebugEntryRAII::DebugEntryRAII(const char *File, const unsigned Line,
 
 DebugEntryRAII::~DebugEntryRAII() {
   if (config::isDebugMode(config::DebugKind::FunctionTracing) &&
-      mapping::getThreadIdInBlock() == 0 && mapping::getBlockId() == 0) {
+      mapping::getThreadIdInBlock() == 0 &&
+      mapping::getBlockIdInKernel() == 0) {
     uint16_t &Level =
         state::getKernelEnvironment().DynamicEnv->DebugIndentionLevel;
     Level--;

diff  --git a/openmp/libomptarget/DeviceRTL/src/Kernel.cpp b/openmp/libomptarget/DeviceRTL/src/Kernel.cpp
index bc1d8d5b3d33f0e..e8f6cfdc6ea5405 100644
--- a/openmp/libomptarget/DeviceRTL/src/Kernel.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Kernel.cpp
@@ -107,10 +107,11 @@ int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment) {
   // reaches its corresponding synchronize::threads call: that would permit all
   // active worker threads to proceed before the main thread has actually set
   // state::ParallelRegionFn, and then they would immediately quit without
-  // doing any work.  mapping::getBlockSize() does not include any of the main
-  // thread's warp, so none of its threads can ever be active worker threads.
+  // doing any work.  mapping::getMaxTeamThreads() does not include any of the
+  // main thread's warp, so none of its threads can ever be active worker
+  // threads.
   if (UseGenericStateMachine &&
-      mapping::getThreadIdInBlock() < mapping::getBlockSize(IsSPMD)) {
+      mapping::getThreadIdInBlock() < mapping::getMaxTeamThreads(IsSPMD)) {
     genericStateMachine(KernelEnvironment.Ident);
   } else {
     // Retrieve the work function just to ensure we always call

diff  --git a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
index 78361284ff8de2b..8f26af086e714de 100644
--- a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
@@ -26,16 +26,16 @@ namespace impl {
 
 // Forward declarations defined to be defined for AMDGCN and NVPTX.
 const llvm::omp::GV &getGridValue();
-uint32_t getNumHardwareThreadsInBlock();
 LaneMaskTy activemask();
 LaneMaskTy lanemaskLT();
 LaneMaskTy lanemaskGT();
 uint32_t getThreadIdInWarp();
-uint32_t getThreadIdInBlock();
-uint32_t getKernelSize();
-uint32_t getBlockId();
-uint32_t getNumberOfBlocks();
-uint32_t getWarpId();
+uint32_t getThreadIdInBlock(int32_t Dim);
+uint32_t getNumberOfThreadsInBlock(int32_t Dim);
+uint32_t getNumberOfThreadsInKernel();
+uint32_t getBlockIdInKernel(int32_t Dim);
+uint32_t getNumberOfBlocksInKernel(int32_t Dim);
+uint32_t getWarpIdInBlock();
 uint32_t getNumberOfWarpsInBlock();
 
 /// AMDGCN Implementation
@@ -47,8 +47,16 @@ const llvm::omp::GV &getGridValue() {
   return llvm::omp::getAMDGPUGridValues<__AMDGCN_WAVEFRONT_SIZE>();
 }
 
-uint32_t getNumHardwareThreadsInBlock() {
-  return __builtin_amdgcn_workgroup_size_x();
+uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
+  switch (Dim) {
+  case 0:
+    return __builtin_amdgcn_workgroup_size_x();
+  case 1:
+    return __builtin_amdgcn_workgroup_size_y();
+  case 2:
+    return __builtin_amdgcn_workgroup_size_z();
+  };
+  UNREACHABLE("Dim outside range!");
 }
 
 LaneMaskTy activemask() { return __builtin_amdgcn_read_exec(); }
@@ -73,22 +81,53 @@ uint32_t getThreadIdInWarp() {
   return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
 }
 
-uint32_t getThreadIdInBlock() { return __builtin_amdgcn_workitem_id_x(); }
+uint32_t getThreadIdInBlock(int32_t Dim) {
+  switch (Dim) {
+  case 0:
+    return __builtin_amdgcn_workitem_id_x();
+  case 1:
+    return __builtin_amdgcn_workitem_id_y();
+  case 2:
+    return __builtin_amdgcn_workitem_id_z();
+  };
+  UNREACHABLE("Dim outside range!");
+}
 
-uint32_t getKernelSize() { return __builtin_amdgcn_grid_size_x(); }
+uint32_t getNumberOfThreadsInKernel() {
+  return __builtin_amdgcn_grid_size_x() * __builtin_amdgcn_grid_size_y() *
+         __builtin_amdgcn_grid_size_z();
+}
 
-uint32_t getBlockId() { return __builtin_amdgcn_workgroup_id_x(); }
+uint32_t getBlockIdInKernel(int32_t Dim) {
+  switch (Dim) {
+  case 0:
+    return __builtin_amdgcn_workgroup_id_x();
+  case 1:
+    return __builtin_amdgcn_workgroup_id_y();
+  case 2:
+    return __builtin_amdgcn_workgroup_id_z();
+  };
+  UNREACHABLE("Dim outside range!");
+}
 
-uint32_t getNumberOfBlocks() {
-  return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
+uint32_t getNumberOfBlocksInKernel(int32_t Dim) {
+  switch (Dim) {
+  case 0:
+    return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
+  case 1:
+    return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
+  case 2:
+    return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
+  };
+  UNREACHABLE("Dim outside range!");
 }
 
-uint32_t getWarpId() {
-  return impl::getThreadIdInBlock() / mapping::getWarpSize();
+uint32_t getWarpIdInBlock() {
+  return impl::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize();
 }
 
 uint32_t getNumberOfWarpsInBlock() {
-  return mapping::getBlockSize() / mapping::getWarpSize();
+  return mapping::getNumberOfThreadsInBlock() / mapping::getWarpSize();
 }
 
 #pragma omp end declare variant
@@ -101,8 +140,16 @@ uint32_t getNumberOfWarpsInBlock() {
         device = {arch(nvptx, nvptx64)},                                       \
             implementation = {extension(match_any)})
 
-uint32_t getNumHardwareThreadsInBlock() {
-  return __nvvm_read_ptx_sreg_ntid_x();
+uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
+  switch (Dim) {
+  case 0:
+    return __nvvm_read_ptx_sreg_ntid_x();
+  case 1:
+    return __nvvm_read_ptx_sreg_ntid_y();
+  case 2:
+    return __nvvm_read_ptx_sreg_ntid_z();
+  };
+  UNREACHABLE("Dim outside range!");
 }
 
 const llvm::omp::GV &getGridValue() { return llvm::omp::NVPTXGridValues; }
@@ -125,27 +172,62 @@ LaneMaskTy lanemaskGT() {
   return Res;
 }
 
-uint32_t getThreadIdInBlock() { return __nvvm_read_ptx_sreg_tid_x(); }
+uint32_t getThreadIdInBlock(int32_t Dim) {
+  switch (Dim) {
+  case 0:
+    return __nvvm_read_ptx_sreg_tid_x();
+  case 1:
+    return __nvvm_read_ptx_sreg_tid_y();
+  case 2:
+    return __nvvm_read_ptx_sreg_tid_z();
+  };
+  UNREACHABLE("Dim outside range!");
+}
 
 uint32_t getThreadIdInWarp() {
-  return impl::getThreadIdInBlock() & (mapping::getWarpSize() - 1);
+  return impl::getThreadIdInBlock(mapping::DIM_X) &
+         (mapping::getWarpSize() - 1);
 }
 
-uint32_t getKernelSize() {
-  return __nvvm_read_ptx_sreg_nctaid_x() *
-         mapping::getNumberOfProcessorElements();
+uint32_t getBlockIdInKernel(int32_t Dim) {
+  switch (Dim) {
+  case 0:
+    return __nvvm_read_ptx_sreg_ctaid_x();
+  case 1:
+    return __nvvm_read_ptx_sreg_ctaid_y();
+  case 2:
+    return __nvvm_read_ptx_sreg_ctaid_z();
+  };
+  UNREACHABLE("Dim outside range!");
 }
 
-uint32_t getBlockId() { return __nvvm_read_ptx_sreg_ctaid_x(); }
+uint32_t getNumberOfBlocksInKernel(int32_t Dim) {
+  switch (Dim) {
+  case 0:
+    return __nvvm_read_ptx_sreg_nctaid_x();
+  case 1:
+    return __nvvm_read_ptx_sreg_nctaid_y();
+  case 2:
+    return __nvvm_read_ptx_sreg_nctaid_z();
+  };
+  UNREACHABLE("Dim outside range!");
+}
 
-uint32_t getNumberOfBlocks() { return __nvvm_read_ptx_sreg_nctaid_x(); }
+uint32_t getNumberOfThreadsInKernel() {
+  return impl::getNumberOfThreadsInBlock(0) *
+         impl::getNumberOfBlocksInKernel(0) *
+         impl::getNumberOfThreadsInBlock(1) *
+         impl::getNumberOfBlocksInKernel(1) *
+         impl::getNumberOfThreadsInBlock(2) *
+         impl::getNumberOfBlocksInKernel(2);
+}
 
-uint32_t getWarpId() {
-  return impl::getThreadIdInBlock() / mapping::getWarpSize();
+uint32_t getWarpIdInBlock() {
+  return impl::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize();
 }
 
 uint32_t getNumberOfWarpsInBlock() {
-  return (mapping::getBlockSize() + mapping::getWarpSize() - 1) /
+  return (mapping::getNumberOfThreadsInBlock() + mapping::getWarpSize() - 1) /
          mapping::getWarpSize();
 }
 
@@ -162,7 +244,7 @@ uint32_t getWarpSize() { return getGridValue().GV_Warp_Size; }
 ///{
 
 static bool isInLastWarp() {
-  uint32_t MainTId = (mapping::getNumberOfProcessorElements() - 1) &
+  uint32_t MainTId = (mapping::getNumberOfThreadsInBlock() - 1) &
                      ~(mapping::getWarpSize() - 1);
   return mapping::getThreadIdInBlock() == MainTId;
 }
@@ -203,53 +285,55 @@ uint32_t mapping::getThreadIdInWarp() {
   return ThreadIdInWarp;
 }
 
-uint32_t mapping::getThreadIdInBlock() {
-  uint32_t ThreadIdInBlock = impl::getThreadIdInBlock();
+uint32_t mapping::getThreadIdInBlock(int32_t Dim) {
+  uint32_t ThreadIdInBlock = impl::getThreadIdInBlock(Dim);
   return ThreadIdInBlock;
 }
 
 uint32_t mapping::getWarpSize() { return impl::getWarpSize(); }
 
-uint32_t mapping::getBlockSize(bool IsSPMD) {
-  uint32_t BlockSize =
-      mapping::getNumberOfProcessorElements() - (!IsSPMD * impl::getWarpSize());
-  return BlockSize;
+uint32_t mapping::getMaxTeamThreads(bool IsSPMD) {
+  uint32_t BlockSize = mapping::getNumberOfThreadsInBlock();
+  // If we are in SPMD mode, remove one warp.
+  return BlockSize - (!IsSPMD * impl::getWarpSize());
 }
-uint32_t mapping::getBlockSize() {
-  return mapping::getBlockSize(mapping::isSPMDMode());
+uint32_t mapping::getMaxTeamThreads() {
+  return mapping::getMaxTeamThreads(mapping::isSPMDMode());
 }
 
-uint32_t mapping::getKernelSize() { return impl::getKernelSize(); }
+uint32_t mapping::getNumberOfThreadsInBlock(int32_t Dim) {
+  return impl::getNumberOfThreadsInBlock(Dim);
+}
+
+uint32_t mapping::getNumberOfThreadsInKernel() {
+  return impl::getNumberOfThreadsInKernel();
+}
 
-uint32_t mapping::getWarpId() {
-  uint32_t WarpID = impl::getWarpId();
+uint32_t mapping::getWarpIdInBlock() {
+  uint32_t WarpID = impl::getWarpIdInBlock();
   ASSERT(WarpID < impl::getNumberOfWarpsInBlock(), nullptr);
   return WarpID;
 }
 
-uint32_t mapping::getBlockId() {
-  uint32_t BlockId = impl::getBlockId();
-  ASSERT(BlockId < impl::getNumberOfBlocks(), nullptr);
+uint32_t mapping::getBlockIdInKernel(int32_t Dim) {
+  uint32_t BlockId = impl::getBlockIdInKernel(Dim);
+  ASSERT(BlockId < impl::getNumberOfBlocksInKernel(Dim), nullptr);
   return BlockId;
 }
 
 uint32_t mapping::getNumberOfWarpsInBlock() {
   uint32_t NumberOfWarpsInBlocks = impl::getNumberOfWarpsInBlock();
-  ASSERT(impl::getWarpId() < NumberOfWarpsInBlocks, nullptr);
+  ASSERT(impl::getWarpIdInBlock() < NumberOfWarpsInBlocks, nullptr);
   return NumberOfWarpsInBlocks;
 }
 
-uint32_t mapping::getNumberOfBlocks() {
-  uint32_t NumberOfBlocks = impl::getNumberOfBlocks();
-  ASSERT(impl::getBlockId() < NumberOfBlocks, nullptr);
+uint32_t mapping::getNumberOfBlocksInKernel(int32_t Dim) {
+  uint32_t NumberOfBlocks = impl::getNumberOfBlocksInKernel(Dim);
+  ASSERT(impl::getBlockIdInKernel(Dim) < NumberOfBlocks, nullptr);
   return NumberOfBlocks;
 }
 
-uint32_t mapping::getNumberOfProcessorElements() {
-  uint32_t NumberOfProcessorElements = impl::getNumHardwareThreadsInBlock();
-  ASSERT(impl::getThreadIdInBlock() < NumberOfProcessorElements, nullptr);
-  return NumberOfProcessorElements;
-}
+uint32_t mapping::getNumberOfProcessorElements() { __builtin_trap(); }
 
 ///}
 
@@ -279,7 +363,7 @@ __attribute__((noinline)) uint32_t __kmpc_get_hardware_thread_id_in_block() {
 
 __attribute__((noinline)) uint32_t __kmpc_get_hardware_num_threads_in_block() {
   FunctionTracingRAII();
-  return impl::getNumHardwareThreadsInBlock();
+  return impl::getNumberOfThreadsInBlock(mapping::DIM_X);
 }
 
 __attribute__((noinline)) uint32_t __kmpc_get_warp_size() {
@@ -287,4 +371,5 @@ __attribute__((noinline)) uint32_t __kmpc_get_warp_size() {
   return impl::getWarpSize();
 }
 }
+
 #pragma omp end declare target

diff  --git a/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
index 82b944a8bd0a214..84d8f88105f107f 100644
--- a/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
@@ -49,7 +49,7 @@ namespace {
 uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
   uint32_t NThreadsICV =
       NumThreadsClause != -1 ? NumThreadsClause : icv::NThreads;
-  uint32_t NumThreads = mapping::getBlockSize();
+  uint32_t NumThreads = mapping::getMaxTeamThreads();
 
   if (NThreadsICV != 0 && NThreadsICV < NumThreads)
     NumThreads = NThreadsICV;
@@ -110,8 +110,8 @@ void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
   ASSERT(state::HasThreadState == false, nullptr);
 
   uint32_t NumThreads = determineNumberOfThreads(num_threads);
-  uint32_t BlockSize = mapping::getBlockSize();
-  uint32_t PTeamSize = NumThreads == BlockSize ? 0 : NumThreads;
+  uint32_t MaxTeamThreads = mapping::getMaxTeamThreads();
+  uint32_t PTeamSize = NumThreads == MaxTeamThreads ? 0 : NumThreads;
   if (mapping::isSPMDMode()) {
     // Avoid the race between the read of the `icv::Level` above and the write
     // below by synchronizing all threads here.

diff  --git a/openmp/libomptarget/DeviceRTL/src/Reduction.cpp b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
index f544928e33c4098..8e0b91bfc1eb14b 100644
--- a/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
@@ -91,7 +91,7 @@ static int32_t nvptx_parallel_reduce_nowait(int32_t TId, int32_t num_vars,
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
   uint32_t WarpsNeeded =
       (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
-  uint32_t WarpId = mapping::getWarpId();
+  uint32_t WarpId = mapping::getWarpIdInBlock();
 
   // Volta execution model:
   // For the Generic execution mode a parallel region either has 1 thread and

diff  --git a/openmp/libomptarget/DeviceRTL/src/State.cpp b/openmp/libomptarget/DeviceRTL/src/State.cpp
index f824ea2809c2177..c18368da72d6cd3 100644
--- a/openmp/libomptarget/DeviceRTL/src/State.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/State.cpp
@@ -93,7 +93,7 @@ struct SharedMemorySmartStackTy {
 private:
   /// Compute the size of the storage space reserved for a thread.
   uint32_t computeThreadStorageTotal() {
-    uint32_t NumLanesInBlock = mapping::getNumberOfProcessorElements();
+    uint32_t NumLanesInBlock = mapping::getNumberOfThreadsInBlock();
     return utils::align_down((state::SharedScratchpadSize / NumLanesInBlock),
                              Alignment);
   }
@@ -269,7 +269,7 @@ void state::enterDataEnvironment(IdentTy *Ident) {
       static_cast<ThreadStateTy *>(__kmpc_alloc_shared(sizeof(ThreadStateTy)));
   uintptr_t *ThreadStatesBitsPtr = reinterpret_cast<uintptr_t *>(&ThreadStates);
   if (!atomic::load(ThreadStatesBitsPtr, atomic::seq_cst)) {
-    uint32_t Bytes = sizeof(ThreadStates[0]) * mapping::getBlockSize();
+    uint32_t Bytes = sizeof(ThreadStates[0]) * mapping::getMaxTeamThreads();
     void *ThreadStatesPtr =
         memory::allocGlobal(Bytes, "Thread state array allocation");
     if (!atomic::cas(ThreadStatesBitsPtr, uintptr_t(0),
@@ -322,7 +322,7 @@ void state::assumeInitialState(bool IsSPMD) {
 
 int state::getEffectivePTeamSize() {
   int PTeamSize = state::ParallelTeamSize;
-  return PTeamSize ? PTeamSize : mapping::getBlockSize();
+  return PTeamSize ? PTeamSize : mapping::getMaxTeamThreads();
 }
 
 extern "C" {
@@ -334,7 +334,7 @@ void omp_set_num_threads(int V) { icv::NThreads = V; }
 
 int omp_get_max_threads(void) {
   int NT = icv::NThreads;
-  return NT > 0 ? NT : mapping::getBlockSize();
+  return NT > 0 ? NT : mapping::getMaxTeamThreads();
 }
 
 int omp_get_level(void) {
@@ -373,7 +373,7 @@ int omp_get_num_threads(void) {
   return omp_get_level() != 1 ? 1 : state::getEffectivePTeamSize();
 }
 
-int omp_get_thread_limit(void) { return mapping::getBlockSize(); }
+int omp_get_thread_limit(void) { return mapping::getMaxTeamThreads(); }
 
 int omp_get_num_procs(void) { return mapping::getNumberOfProcessorElements(); }
 
@@ -415,9 +415,9 @@ int omp_get_num_devices(void) { return config::getNumDevices(); }
 
 int omp_get_device_num(void) { return config::getDeviceNum(); }
 
-int omp_get_num_teams(void) { return mapping::getNumberOfBlocks(); }
+int omp_get_num_teams(void) { return mapping::getNumberOfBlocksInKernel(); }
 
-int omp_get_team_num() { return mapping::getBlockId(); }
+int omp_get_team_num() { return mapping::getBlockIdInKernel(); }
 
 int omp_get_initial_device(void) { return -1; }
 }

diff  --git a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
index 36536b7a81a164d..cb60aab25babfc8 100644
--- a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
@@ -385,7 +385,7 @@ void setLock(omp_lock_t *Lock) {
     for (;;) {
       now = __nvvm_read_ptx_sreg_clock();
       int32_t cycles = now > start ? now - start : now + (0xffffffff - start);
-      if (cycles >= OMP_SPIN * mapping::getBlockId()) {
+      if (cycles >= OMP_SPIN * mapping::getBlockIdInKernel()) {
         break;
       }
     }