[llvm-branch-commits] [openmp] ea616f9 - [libomptarget][devicertl][nfc] Remove some cuda intrinsics, simplify

Wed Jan 20 11:49:53 PST 2021

Author: Jon Chesterfield
Date: 2021-01-20T19:45:05Z
New Revision: ea616f9026dc6bd9c67ebe2d3226ac91122a7945

URL: https://github.com/llvm/llvm-project/commit/ea616f9026dc6bd9c67ebe2d3226ac91122a7945
DIFF: https://github.com/llvm/llvm-project/commit/ea616f9026dc6bd9c67ebe2d3226ac91122a7945.diff

LOG: [libomptarget][devicertl][nfc] Remove some cuda intrinsics, simplify

[libomptarget][devicertl][nfc] Remove some cuda intrinsics, simplify

Replace __popc, __ffs with clang intrinsics. Move kmpc_impl_min to only file
that uses it and replace template with explictly typed.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D95060

Added: 
    

Modified: 
    openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
    openmp/libomptarget/deviceRTLs/common/src/reduction.cu
    openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h

Removed: 
    


################################################################################
diff  --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
index d25ea8559c05..b1e9a1a9403a 100644

--- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
+++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
@@ -65,6 +65,10 @@ enum DATA_SHARING_SIZES {
   DS_Max_Warp_Number = 16,
 };
 
+enum : __kmpc_impl_lanemask_t {
+  __kmpc_impl_all_lanes = ~(__kmpc_impl_lanemask_t)0
+};
+
 INLINE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
   lo = (uint32_t)(val & UINT64_C(0x00000000FFFFFFFF));
   hi = (uint32_t)((val & UINT64_C(0xFFFFFFFF00000000)) >> 32);
@@ -74,28 +78,15 @@ INLINE uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) {
   return (((uint64_t)hi) << 32) | (uint64_t)lo;
 }
 
-enum : __kmpc_impl_lanemask_t {
-  __kmpc_impl_all_lanes = ~(__kmpc_impl_lanemask_t)0
-};
-
 DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt();
-
 DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt();
-
 DEVICE uint32_t __kmpc_impl_smid();
-
 DEVICE double __kmpc_impl_get_wtick();
-
 DEVICE double __kmpc_impl_get_wtime();
 
 INLINE uint64_t __kmpc_impl_ffs(uint64_t x) { return __builtin_ffsl(x); }
-
 INLINE uint64_t __kmpc_impl_popc(uint64_t x) { return __builtin_popcountl(x); }
 
-template <typename T> INLINE T __kmpc_impl_min(T x, T y) {
-  return x < y ? x : y;
-}
-
 DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask();
 
 DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t, int32_t Var,

diff  --git a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu
index 92b34d77bd8a..3a3c44503f34 100644
--- a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu
@@ -184,6 +184,8 @@ INLINE static uint32_t roundToWarpsize(uint32_t s) {
   return (s & ~(unsigned)(WARPSIZE - 1));
 }
 
+INLINE static uint32_t kmpcMin(uint32_t x, uint32_t y) { return x < y ? x : y; }
+
 DEVICE static volatile uint32_t IterCnt = 0;
 DEVICE static volatile uint32_t Cnt = 0;
 EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
@@ -261,14 +263,14 @@ EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
   //         by returning 1 in the thread holding the reduction result.
 
   // Check if this is the very last team.
-  unsigned NumRecs = __kmpc_impl_min(NumTeams, uint32_t(num_of_records));
+  unsigned NumRecs = kmpcMin(NumTeams, uint32_t(num_of_records));
   if (ChunkTeamCount == NumTeams - Bound - 1) {
     //
     // Last team processing.
     //
     if (ThreadId >= NumRecs)
       return 0;
-    NumThreads = roundToWarpsize(__kmpc_impl_min(NumThreads, NumRecs));
+    NumThreads = roundToWarpsize(kmpcMin(NumThreads, NumRecs));
     if (ThreadId >= NumThreads)
       return 0;
 
@@ -283,7 +285,7 @@ EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
 
       // When we have more than [warpsize] number of threads
       // a block reduction is performed here.
-      uint32_t ActiveThreads = __kmpc_impl_min(NumRecs, NumThreads);
+      uint32_t ActiveThreads = kmpcMin(NumRecs, NumThreads);
       if (ActiveThreads > WARPSIZE) {
         uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE;
         // Gather all the reduced values from each warp

diff  --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
index 8382cd6aaf47..ab9fd1697f14 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
@@ -93,13 +93,8 @@ DEVICE uint32_t __kmpc_impl_smid();
 DEVICE double __kmpc_impl_get_wtick();
 DEVICE double __kmpc_impl_get_wtime();
 
-INLINE uint32_t __kmpc_impl_ffs(uint32_t x) { return __ffs(x); }
-
-INLINE uint32_t __kmpc_impl_popc(uint32_t x) { return __popc(x); }
-
-template <typename T> INLINE T __kmpc_impl_min(T x, T y) {
-  return min(x, y);
-}
+INLINE uint32_t __kmpc_impl_ffs(uint32_t x) { return __builtin_ffs(x); }
+INLINE uint32_t __kmpc_impl_popc(uint32_t x) { return __builtin_popcount(x); }
 
 #ifndef CUDA_VERSION
 #error CUDA_VERSION macro is undefined, something wrong with cuda.