[llvm] 5a682d9 - [OpenMP] Expose libomptarget function to get HW thread id

Giorgis Georgakoudis via llvm-commits llvm-commits at lists.llvm.org
Wed Jul 21 10:26:13 PDT 2021


Author: Giorgis Georgakoudis
Date: 2021-07-21T10:26:04-07:00
New Revision: 5a682d9b9109494cb46e16b493cf4afc5e25e598

URL: https://github.com/llvm/llvm-project/commit/5a682d9b9109494cb46e16b493cf4afc5e25e598
DIFF: https://github.com/llvm/llvm-project/commit/5a682d9b9109494cb46e16b493cf4afc5e25e598.diff

LOG: [OpenMP] Expose libomptarget function to get HW thread id

The patch exposes the libomptarget runtime function that gets the hardware thread id through the kmpc API. This is to be used in SPMDization for checking the thread id to execute regions by a single thread in a block.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D106323

Added: 
    

Modified: 
    llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
    openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
    openmp/libomptarget/deviceRTLs/common/debug.h
    openmp/libomptarget/deviceRTLs/common/omptargeti.h
    openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu
    openmp/libomptarget/deviceRTLs/common/src/libcall.cu
    openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
    openmp/libomptarget/deviceRTLs/common/src/parallel.cu
    openmp/libomptarget/deviceRTLs/common/src/reduction.cu
    openmp/libomptarget/deviceRTLs/common/src/support.cu
    openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
    openmp/libomptarget/deviceRTLs/target_interface.h

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
index f227fc65fe50e..93ab3f6c79990 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -195,6 +195,7 @@ __OMP_RTL(__kmpc_cancel, false, Int32, IdentPtr, Int32, Int32)
 __OMP_RTL(__kmpc_cancel_barrier, false, Int32, IdentPtr, Int32)
 __OMP_RTL(__kmpc_flush, false, Void, IdentPtr)
 __OMP_RTL(__kmpc_global_thread_num, false, Int32, IdentPtr)
+__OMP_RTL(__kmpc_get_hardware_thread_id_in_block, false, Int32, )
 __OMP_RTL(__kmpc_fork_call, true, Void, IdentPtr, Int32, ParallelTaskPtr)
 __OMP_RTL(__kmpc_omp_taskwait, false, Int32, IdentPtr, Int32)
 __OMP_RTL(__kmpc_omp_taskyield, false, Int32, IdentPtr, Int32, /* Int */ Int32)

diff  --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
index 8398b4d6f0535..2bbeab73b136d 100644
--- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
+++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
@@ -133,7 +133,7 @@ EXTERN int GetNumberOfThreadsInBlock() {
                            __builtin_amdgcn_workgroup_size_x());
 }
 
-EXTERN unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; }
+EXTERN unsigned GetWarpId() { return __kmpc_get_hardware_thread_id_in_block() / WARPSIZE; }
 EXTERN unsigned GetWarpSize() { return WARPSIZE; }
 EXTERN unsigned GetLaneId() {
   return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
@@ -212,7 +212,7 @@ EXTERN void __kmpc_impl_threadfence_system() {
 }
 
 // Calls to the AMDGCN layer (assuming 1D layout)
-EXTERN int GetThreadIdInBlock() { return __builtin_amdgcn_workitem_id_x(); }
+EXTERN int __kmpc_get_hardware_thread_id_in_block() { return __builtin_amdgcn_workitem_id_x(); }
 EXTERN int GetBlockIdInKernel() { return __builtin_amdgcn_workgroup_id_x(); }
 
 #pragma omp end declare target

diff  --git a/openmp/libomptarget/deviceRTLs/common/debug.h b/openmp/libomptarget/deviceRTLs/common/debug.h
index 3b2895e22a29b..99c9b6cd58183 100644
--- a/openmp/libomptarget/deviceRTLs/common/debug.h
+++ b/openmp/libomptarget/deviceRTLs/common/debug.h
@@ -132,8 +132,9 @@
 
 template <typename... Arguments>
 NOINLINE static void log(const char *fmt, Arguments... parameters) {
-  printf(fmt, (int)GetBlockIdInKernel(), (int)GetThreadIdInBlock(),
-         (int)GetWarpId(), (int)GetLaneId(), parameters...);
+  printf(fmt, (int)GetBlockIdInKernel(),
+         (int)__kmpc_get_hardware_thread_id_in_block(), (int)GetWarpId(),
+         (int)GetLaneId(), parameters...);
 }
 
 #endif
@@ -143,8 +144,9 @@ template <typename... Arguments>
 NOINLINE static void check(bool cond, const char *fmt,
                            Arguments... parameters) {
   if (!cond) {
-    printf(fmt, (int)GetBlockIdInKernel(), (int)GetThreadIdInBlock(),
-           (int)GetWarpId(), (int)GetLaneId(), parameters...);
+    printf(fmt, (int)GetBlockIdInKernel(),
+           (int)__kmpc_get_hardware_thread_id_in_block(), (int)GetWarpId(),
+           (int)GetLaneId(), parameters...);
     __builtin_trap();
   }
 }

diff  --git a/openmp/libomptarget/deviceRTLs/common/omptargeti.h b/openmp/libomptarget/deviceRTLs/common/omptargeti.h
index 02feaf5f30bdc..485e30cc8d72c 100644
--- a/openmp/libomptarget/deviceRTLs/common/omptargeti.h
+++ b/openmp/libomptarget/deviceRTLs/common/omptargeti.h
@@ -54,7 +54,8 @@ INLINE void omptarget_nvptx_TaskDescr::InitLevelOneTaskDescr(
 
   items.flags = TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel
   items.threadId =
-      GetThreadIdInBlock();   // get ids from cuda (only called for 1st level)
+      __kmpc_get_hardware_thread_id_in_block(); // get ids from cuda (only
+                                                // called for 1st level)
   items.runtimeChunkSize = 1; // preferred chunking statik with chunk 1
   prev = parentTaskDescr;
 }
@@ -97,16 +98,16 @@ INLINE void omptarget_nvptx_TaskDescr::CopyFromWorkDescr(
   //
   // overwrite specific items;
   //
-  // The threadID should be GetThreadIdInBlock() % GetMasterThreadID().
-  // This is so that the serial master (first lane in the master warp)
-  // gets a threadId of 0.
-  // However, we know that this function is always called in a parallel
-  // region where only workers are active.  The serial master thread
-  // never enters this region.  When a parallel region is executed serially,
-  // the threadId is set to 0 elsewhere and the kmpc_serialized_* functions
-  // are called, which never activate this region.
+  // The threadID should be __kmpc_get_hardware_thread_id_in_block() %
+  // GetMasterThreadID(). This is so that the serial master (first lane in the
+  // master warp) gets a threadId of 0. However, we know that this function is
+  // always called in a parallel region where only workers are active.  The
+  // serial master thread never enters this region.  When a parallel region is
+  // executed serially, the threadId is set to 0 elsewhere and the
+  // kmpc_serialized_* functions are called, which never activate this region.
   items.threadId =
-      GetThreadIdInBlock(); // get ids from cuda (only called for 1st level)
+      __kmpc_get_hardware_thread_id_in_block(); // get ids from cuda (only
+                                                // called for 1st level)
 }
 
 INLINE void omptarget_nvptx_TaskDescr::CopyConvergentParent(

diff  --git a/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu b/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu
index 445e8c1faf0ae..d8bb7ea12bc37 100644
--- a/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu
@@ -48,7 +48,8 @@ static void *__kmpc_alloc_for_warp(AllocTy Alloc, unsigned Bytes,
   void *Ptr;
   __kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask();
   unsigned LeaderID = __kmpc_impl_ffs(CurActive) - 1;
-  bool IsWarpLeader = (GetThreadIdInBlock() % WARPSIZE) == LeaderID;
+  bool IsWarpLeader =
+      (__kmpc_get_hardware_thread_id_in_block() % WARPSIZE) == LeaderID;
   if (IsWarpLeader)
     Ptr = Alloc();
   // Get address from the first active lane.
@@ -61,7 +62,7 @@ static void *__kmpc_alloc_for_warp(AllocTy Alloc, unsigned Bytes,
 
 EXTERN void *__kmpc_alloc_shared(size_t Bytes) {
   Bytes = Bytes + (Bytes % MinBytes);
-  int TID = GetThreadIdInBlock();
+  int TID = __kmpc_get_hardware_thread_id_in_block();
   if (__kmpc_is_generic_main_thread(TID)) {
     // Main thread alone, use shared memory if space is available.
     if (MainSharedStack.Usage[0] + Bytes <= MainSharedStack.MaxSize) {
@@ -97,7 +98,8 @@ EXTERN void *__kmpc_alloc_shared(size_t Bytes) {
 EXTERN void __kmpc_free_shared(void *Ptr) {
   __kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask();
   unsigned LeaderID = __kmpc_impl_ffs(CurActive) - 1;
-  bool IsWarpLeader = (GetThreadIdInBlock() % WARPSIZE) == LeaderID;
+  bool IsWarpLeader =
+      (__kmpc_get_hardware_thread_id_in_block() % WARPSIZE) == LeaderID;
   __kmpc_syncwarp(CurActive);
   if (IsWarpLeader) {
     if (Ptr >= &MainSharedStack.Data[0] &&
@@ -190,13 +192,14 @@ EXTERN void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode,
     return;
   }
   if (isSPMDExecutionMode) {
-    if (GetThreadIdInBlock() == 0) {
+    if (__kmpc_get_hardware_thread_id_in_block() == 0) {
       *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size);
     }
     __kmpc_impl_syncthreads();
     return;
   }
-  ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(),
+  ASSERT0(LT_FUSSY,
+          __kmpc_get_hardware_thread_id_in_block() == GetMasterThreadID(),
           "Must be called only in the target master thread.");
   *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size);
   __kmpc_impl_threadfence();
@@ -208,13 +211,14 @@ EXTERN void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode,
     return;
   if (isSPMDExecutionMode) {
     __kmpc_impl_syncthreads();
-    if (GetThreadIdInBlock() == 0) {
+    if (__kmpc_get_hardware_thread_id_in_block() == 0) {
       omptarget_nvptx_simpleMemoryManager.Release();
     }
     return;
   }
   __kmpc_impl_threadfence();
-  ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(),
+  ASSERT0(LT_FUSSY,
+          __kmpc_get_hardware_thread_id_in_block() == GetMasterThreadID(),
           "Must be called only in the target master thread.");
   omptarget_nvptx_simpleMemoryManager.Release();
 }

diff  --git a/openmp/libomptarget/deviceRTLs/common/src/libcall.cu b/openmp/libomptarget/deviceRTLs/common/src/libcall.cu
index 49d6d9f835780..9c62b5bb14c09 100644
--- a/openmp/libomptarget/deviceRTLs/common/src/libcall.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/libcall.cu
@@ -142,7 +142,7 @@ EXTERN int omp_get_active_level(void) {
 
 EXTERN int omp_get_ancestor_thread_num(int level) {
   if (__kmpc_is_spmd_exec_mode())
-    return level == 1 ? GetThreadIdInBlock() : 0;
+    return level == 1 ? __kmpc_get_hardware_thread_id_in_block() : 0;
   int rc = -1;
   // If level is 0 or all parallel regions are not active - return 0.
   unsigned parLevel = parallelLevel[GetWarpId()];

diff  --git a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
index 21608549edf1b..3b620de71055a 100644
--- a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
@@ -34,7 +34,7 @@ static void __kmpc_generic_kernel_init() {
   if (GetLaneId() == 0)
     parallelLevel[GetWarpId()] = 0;
 
-  int threadIdInBlock = GetThreadIdInBlock();
+  int threadIdInBlock = __kmpc_get_hardware_thread_id_in_block();
   if (threadIdInBlock != GetMasterThreadID())
     return;
 
@@ -87,7 +87,7 @@ static void __kmpc_spmd_kernel_init(bool RequiresFullRuntime) {
 
   setExecutionParameters(Spmd, RequiresFullRuntime ? RuntimeInitialized
                          : RuntimeUninitialized);
-  int threadId = GetThreadIdInBlock();
+  int threadId = __kmpc_get_hardware_thread_id_in_block();
   if (threadId == 0) {
     usedSlotIdx = __kmpc_impl_smid() % MAX_SM;
   }
@@ -147,7 +147,7 @@ static void __kmpc_spmd_kernel_deinit(bool RequiresFullRuntime) {
     return;
 
   __kmpc_impl_syncthreads();
-  int threadId = GetThreadIdInBlock();
+  int threadId = __kmpc_get_hardware_thread_id_in_block();
   if (threadId == 0) {
     // Enqueue omp state object for use by another team.
     int slot = usedSlotIdx;
@@ -169,7 +169,7 @@ EXTERN bool __kmpc_kernel_parallel(void**WorkFn);
 
 static void __kmpc_target_region_state_machine(ident_t *Ident) {
 
-  int TId = GetThreadIdInBlock();
+  int TId = __kmpc_get_hardware_thread_id_in_block();
   do {
     void* WorkFn = 0;
 
@@ -199,7 +199,7 @@ EXTERN
 int32_t __kmpc_target_init(ident_t *Ident, bool IsSPMD,
                            bool UseGenericStateMachine,
                            bool RequiresFullRuntime) {
-  int TId = GetThreadIdInBlock();
+  int TId = __kmpc_get_hardware_thread_id_in_block();
   if (IsSPMD)
     __kmpc_spmd_kernel_init(RequiresFullRuntime);
   else

diff  --git a/openmp/libomptarget/deviceRTLs/common/src/parallel.cu b/openmp/libomptarget/deviceRTLs/common/src/parallel.cu
index f30a5b58be880..839816335c6e6 100644
--- a/openmp/libomptarget/deviceRTLs/common/src/parallel.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/parallel.cu
@@ -105,7 +105,8 @@ EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn) {
 
   ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads",
          (int)NumThreads);
-  ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(),
+  ASSERT0(LT_FUSSY,
+          __kmpc_get_hardware_thread_id_in_block() == GetMasterThreadID(),
           "only team master can create parallel");
 
   // Set number of threads on work descriptor.
@@ -133,7 +134,7 @@ EXTERN bool __kmpc_kernel_parallel(void **WorkFn) {
 
   // Only the worker threads call this routine and the master warp
   // never arrives here.  Therefore, use the nvptx thread id.
-  int threadId = GetThreadIdInBlock();
+  int threadId = __kmpc_get_hardware_thread_id_in_block();
   omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
   // Set to true for workers participating in the parallel region.
   bool isActive = false;
@@ -166,7 +167,7 @@ EXTERN void __kmpc_kernel_end_parallel() {
 
   // Only the worker threads call this routine and the master warp
   // never arrives here.  Therefore, use the nvptx thread id.
-  int threadId = GetThreadIdInBlock();
+  int threadId = __kmpc_get_hardware_thread_id_in_block();
   omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
   omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
       threadId, currTaskDescr->GetPrevTaskDescr());

diff  --git a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu
index 6c02790ac0aae..3a658f50a9960 100644
--- a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu
@@ -47,7 +47,7 @@ INLINE static void gpu_irregular_warp_reduce(void *reduce_data,
 INLINE static uint32_t
 gpu_irregular_simd_reduce(void *reduce_data, kmp_ShuffleReductFctPtr shflFct) {
   uint32_t size, remote_id, physical_lane_id;
-  physical_lane_id = GetThreadIdInBlock() % WARPSIZE;
+  physical_lane_id = __kmpc_get_hardware_thread_id_in_block() % WARPSIZE;
   __kmpc_impl_lanemask_t lanemask_lt = __kmpc_impl_lanemask_lt();
   __kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask();
   uint32_t logical_lane_id = __kmpc_impl_popc(Liveness & lanemask_lt) * 2;
@@ -95,9 +95,10 @@ static int32_t nvptx_parallel_reduce_nowait(
   if ((NumThreads % WARPSIZE == 0) || (WarpId < WarpsNeeded - 1))
     gpu_regular_warp_reduce(reduce_data, shflFct);
   else if (NumThreads > 1) // Only SPMD execution mode comes thru this case.
-    gpu_irregular_warp_reduce(reduce_data, shflFct,
-                              /*LaneCount=*/NumThreads % WARPSIZE,
-                              /*LaneId=*/GetThreadIdInBlock() % WARPSIZE);
+    gpu_irregular_warp_reduce(
+        reduce_data, shflFct,
+        /*LaneCount=*/NumThreads % WARPSIZE,
+        /*LaneId=*/__kmpc_get_hardware_thread_id_in_block() % WARPSIZE);
 
   // When we have more than [warpsize] number of threads
   // a block reduction is performed here.
@@ -118,9 +119,10 @@ static int32_t nvptx_parallel_reduce_nowait(
   if (Liveness == __kmpc_impl_all_lanes) // Full warp
     gpu_regular_warp_reduce(reduce_data, shflFct);
   else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
-    gpu_irregular_warp_reduce(reduce_data, shflFct,
-                              /*LaneCount=*/__kmpc_impl_popc(Liveness),
-                              /*LaneId=*/GetThreadIdInBlock() % WARPSIZE);
+    gpu_irregular_warp_reduce(
+        reduce_data, shflFct,
+        /*LaneCount=*/__kmpc_impl_popc(Liveness),
+        /*LaneId=*/__kmpc_get_hardware_thread_id_in_block() % WARPSIZE);
   else if (!isRuntimeUninitialized) // Dispersed lanes. Only threads in L2
                                     // parallel region may enter here; return
                                     // early.
@@ -185,7 +187,7 @@ EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
 
   // Terminate all threads in non-SPMD mode except for the master thread.
   if (!__kmpc_is_spmd_exec_mode() &&
-      !__kmpc_is_generic_main_thread(GetThreadIdInBlock()))
+      !__kmpc_is_generic_main_thread(__kmpc_get_hardware_thread_id_in_block()))
     return 0;
 
   uint32_t ThreadId = GetLogicalThreadIdInBlock();

diff  --git a/openmp/libomptarget/deviceRTLs/common/src/support.cu b/openmp/libomptarget/deviceRTLs/common/src/support.cu
index d78f94c0b1b5e..3d93ea00fbae9 100644
--- a/openmp/libomptarget/deviceRTLs/common/src/support.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/support.cu
@@ -70,7 +70,7 @@ int GetNumberOfWorkersInTeam() { return GetMasterThreadID(); }
 int GetLogicalThreadIdInBlock() {
   // Implemented using control flow (predication) instead of with a modulo
   // operation.
-  int tid = GetThreadIdInBlock();
+  int tid = __kmpc_get_hardware_thread_id_in_block();
   if (__kmpc_is_generic_main_thread(tid))
     return 0;
   else
@@ -84,7 +84,7 @@ int GetLogicalThreadIdInBlock() {
 ////////////////////////////////////////////////////////////////////////////////
 
 int GetOmpThreadId() {
-  int tid = GetThreadIdInBlock();
+  int tid = __kmpc_get_hardware_thread_id_in_block();
   if (__kmpc_is_generic_main_thread(tid))
     return 0;
   // omp_thread_num

diff  --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
index 35324f070e4d6..c1b4007f1dcb7 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
@@ -60,7 +60,7 @@ EXTERN __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
   return Mask;
 }
 
-EXTERN void __kmpc_impl_syncthreads() { 
+EXTERN void __kmpc_impl_syncthreads() {
   int barrier = 2;
   asm volatile("barrier.sync %0;"
                :
@@ -92,15 +92,21 @@ EXTERN void __kmpc_impl_threadfence_block() { __nvvm_membar_cta(); }
 EXTERN void __kmpc_impl_threadfence_system() { __nvvm_membar_sys(); }
 
 // Calls to the NVPTX layer (assuming 1D layout)
-EXTERN int GetThreadIdInBlock() { return __nvvm_read_ptx_sreg_tid_x(); }
+EXTERN int __kmpc_get_hardware_thread_id_in_block() {
+  return __nvvm_read_ptx_sreg_tid_x();
+}
 EXTERN int GetBlockIdInKernel() { return __nvvm_read_ptx_sreg_ctaid_x(); }
 EXTERN int GetNumberOfBlocksInKernel() {
   return __nvvm_read_ptx_sreg_nctaid_x();
 }
 EXTERN int GetNumberOfThreadsInBlock() { return __nvvm_read_ptx_sreg_ntid_x(); }
-EXTERN unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; }
+EXTERN unsigned GetWarpId() {
+  return __kmpc_get_hardware_thread_id_in_block() / WARPSIZE;
+}
 EXTERN unsigned GetWarpSize() { return WARPSIZE; }
-EXTERN unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); }
+EXTERN unsigned GetLaneId() {
+  return __kmpc_get_hardware_thread_id_in_block() & (WARPSIZE - 1);
+}
 
 // Atomics
 uint32_t __kmpc_atomic_add(uint32_t *Address, uint32_t Val) {

diff  --git a/openmp/libomptarget/deviceRTLs/target_interface.h b/openmp/libomptarget/deviceRTLs/target_interface.h
index c5141c9d6fc14..7e6ae0dc16da6 100644
--- a/openmp/libomptarget/deviceRTLs/target_interface.h
+++ b/openmp/libomptarget/deviceRTLs/target_interface.h
@@ -16,7 +16,7 @@
 #include "target_impl.h"
 
 // Calls to the NVPTX layer (assuming 1D layout)
-EXTERN int GetThreadIdInBlock();
+EXTERN int __kmpc_get_hardware_thread_id_in_block();
 EXTERN int GetBlockIdInKernel();
 EXTERN int GetNumberOfBlocksInKernel();
 EXTERN int GetNumberOfThreadsInBlock();


        


More information about the llvm-commits mailing list