[Openmp-commits] [openmp] 53bcd1e - [libomptarget][nfc] Wrap cuda min() in target_impl

Mon Dec 16 17:30:19 PST 2019

Author: Jon Chesterfield
Date: 2019-12-17T01:30:04Z
New Revision: 53bcd1e1413c878d2d988df80142a430a9abf24a

URL: https://github.com/llvm/llvm-project/commit/53bcd1e1413c878d2d988df80142a430a9abf24a
DIFF: https://github.com/llvm/llvm-project/commit/53bcd1e1413c878d2d988df80142a430a9abf24a.diff

LOG: [libomptarget][nfc] Wrap cuda min() in target_impl

Summary:
[libomptarget][nfc] Wrap cuda min() in target_impl

The nvptx implementation forwards to cuda min(); amdgcn implements it directly.
This is sufficient to build parallel.cu for amdgcn, so it is added to the
amdgcn CMakeLists.txt.

All call sites are homogeneous except one that passes a uint32_t and an
int32_t. This could be smoothed over by taking two type parameters
and some care over the return type, but overall I think the inline
<uint32_t> calling attention to what was an implicit sign conversion
is cleaner.

Reviewers: ABataev, jdoerfert

Reviewed By: jdoerfert

Subscribers: jvesely, mgorny, openmp-commits

Tags: #openmp

Differential Revision: https://reviews.llvm.org/D71580

Added: 
    

Modified: 
    openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt
    openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
    openmp/libomptarget/deviceRTLs/common/src/parallel.cu
    openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu
    openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h

Removed: 
    


################################################################################
diff  --git a/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt
index 671508aac418..802ab0b42cc5 100644

--- a/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt
+++ b/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt
@@ -59,6 +59,7 @@ set(cuda_sources
   ${devicertl_base_directory}/common/src/critical.cu
   ${devicertl_base_directory}/common/src/loop.cu
   ${devicertl_base_directory}/common/src/omptarget.cu
+  ${devicertl_base_directory}/common/src/parallel.cu
   ${devicertl_base_directory}/common/src/sync.cu
   ${devicertl_base_directory}/common/src/task.cu)
 

diff  --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
index 5082d469d050..858a023eb8d6 100644
--- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
+++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
@@ -109,6 +109,10 @@ INLINE uint64_t __kmpc_impl_ffs(uint64_t x) { return __builtin_ffsl(x); }
 
 INLINE uint64_t __kmpc_impl_popc(uint64_t x) { return __builtin_popcountl(x); }
 
+template <typename T> INLINE T __kmpc_impl_min(T x, T y) {
+  return x < y ? x : y;
+}
+
 INLINE __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
   return __ballot64(1);
 }

diff  --git a/openmp/libomptarget/deviceRTLs/common/src/parallel.cu b/openmp/libomptarget/deviceRTLs/common/src/parallel.cu
index 4934621de58d..2a02c69e7e84 100644
--- a/openmp/libomptarget/deviceRTLs/common/src/parallel.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/parallel.cu
@@ -72,7 +72,7 @@ EXTERN bool __kmpc_kernel_convergent_simd(void *buffer,
 
   // We cannot have more than the # of convergent threads.
   if (SimdLimitSource > 0)
-    *NumLanes = min(ConvergentSize, SimdLimitSource);
+    *NumLanes = __kmpc_impl_min(ConvergentSize, SimdLimitSource);
   else
     *NumLanes = ConvergentSize;
   ASSERT(LT_FUSSY, *NumLanes > 0, "bad thread request of %d threads",
@@ -149,7 +149,7 @@ EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer,
   // We cannot have more than the # of convergent threads.
   uint16_t NumThreads;
   if (NumThreadsSource > 0)
-    NumThreads = min(ConvergentSize, NumThreadsSource);
+    NumThreads = __kmpc_impl_min(ConvergentSize, NumThreadsSource);
   else
     NumThreads = ConvergentSize;
   ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads",

diff  --git a/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu
index cfccf78c377a..fa9c130c0fcc 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu
@@ -480,14 +480,14 @@ EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
   //         by returning 1 in the thread holding the reduction result.
 
   // Check if this is the very last team.
-  unsigned NumRecs = min(NumTeams, num_of_records);
+  unsigned NumRecs = __kmpc_impl_min(NumTeams, uint32_t(num_of_records));
   if (ChunkTeamCount == NumTeams - Bound - 1) {
     //
     // Last team processing.
     //
     if (ThreadId >= NumRecs)
       return 0;
-    NumThreads = roundToWarpsize(min(NumThreads, NumRecs));
+    NumThreads = roundToWarpsize(__kmpc_impl_min(NumThreads, NumRecs));
     if (ThreadId >= NumThreads)
       return 0;
 
@@ -502,7 +502,7 @@ EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
 
       // When we have more than [warpsize] number of threads
       // a block reduction is performed here.
-      uint32_t ActiveThreads = min(NumRecs, NumThreads);
+      uint32_t ActiveThreads = __kmpc_impl_min(NumRecs, NumThreads);
       if (ActiveThreads > WARPSIZE) {
         uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE;
         // Gather all the reduced values from each warp

diff  --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
index 161cd6cac110..4bb66776a2aa 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
@@ -104,6 +104,10 @@ INLINE uint32_t __kmpc_impl_ffs(uint32_t x) { return __ffs(x); }
 
 INLINE uint32_t __kmpc_impl_popc(uint32_t x) { return __popc(x); }
 
+template <typename T> INLINE T __kmpc_impl_min(T x, T y) {
+  return min(x, y);
+}
+
 #ifndef CUDA_VERSION
 #error CUDA_VERSION macro is undefined, something wrong with cuda.
 #endif