[Openmp-commits] [PATCH] D71580: [libomptarget][nfc] Wrap cuda min() in target_impl
Jon Chesterfield via Phabricator via Openmp-commits
openmp-commits at lists.llvm.org
Mon Dec 16 16:16:33 PST 2019
JonChesterfield created this revision.
JonChesterfield added reviewers: ABataev, jdoerfert.
Herald added subscribers: openmp-commits, mgorny, jvesely.
Herald added a project: OpenMP.
JonChesterfield marked an inline comment as done.
JonChesterfield added inline comments.
================
Comment at: openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu:483
// Check if this is the very last team.
- unsigned NumRecs = min(NumTeams, num_of_records);
+ unsigned NumRecs = __kmpc_impl_min<uint32_t>(NumTeams, num_of_records);
if (ChunkTeamCount == NumTeams - Bound - 1) {
----------------
Here, NumTeams is a uint32_t and num_of_records is an int32_t. I quite like the `<>` sigil calling attention to this, but am open to alternatives.
[libomptarget][nfc] Wrap cuda min() in target_impl
nvptx forwards to the CUDA min(); amdgcn implements it directly.
This is sufficient to build parallel.cu for amdgcn, so that file is added to the amdgcn CMakeLists.txt.
All call sites are homogeneous except one that passes a uint32_t and an
int32_t. This could be smoothed over by taking two type parameters
and taking some care over the return type, but overall I think the explicit
<uint32_t>, which calls attention to what was an implicit sign conversion,
is cleaner.
Repository:
rG LLVM Github Monorepo
https://reviews.llvm.org/D71580
Files:
openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt
openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
openmp/libomptarget/deviceRTLs/common/src/parallel.cu
openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu
openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
Index: openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
===================================================================
--- openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
+++ openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
@@ -104,6 +104,10 @@
INLINE uint32_t __kmpc_impl_popc(uint32_t x) { return __popc(x); }
+template <typename T> INLINE T __kmpc_impl_min(T x, T y) {
+ return min(x, y);
+}
+
#ifndef CUDA_VERSION
#error CUDA_VERSION macro is undefined, something wrong with cuda.
#endif
Index: openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu
===================================================================
--- openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu
+++ openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu
@@ -480,14 +480,14 @@
// by returning 1 in the thread holding the reduction result.
// Check if this is the very last team.
- unsigned NumRecs = min(NumTeams, num_of_records);
+ unsigned NumRecs = __kmpc_impl_min<uint32_t>(NumTeams, num_of_records);
if (ChunkTeamCount == NumTeams - Bound - 1) {
//
// Last team processing.
//
if (ThreadId >= NumRecs)
return 0;
- NumThreads = roundToWarpsize(min(NumThreads, NumRecs));
+ NumThreads = roundToWarpsize(__kmpc_impl_min(NumThreads, NumRecs));
if (ThreadId >= NumThreads)
return 0;
@@ -502,7 +502,7 @@
// When we have more than [warpsize] number of threads
// a block reduction is performed here.
- uint32_t ActiveThreads = min(NumRecs, NumThreads);
+ uint32_t ActiveThreads = __kmpc_impl_min(NumRecs, NumThreads);
if (ActiveThreads > WARPSIZE) {
uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE;
// Gather all the reduced values from each warp
Index: openmp/libomptarget/deviceRTLs/common/src/parallel.cu
===================================================================
--- openmp/libomptarget/deviceRTLs/common/src/parallel.cu
+++ openmp/libomptarget/deviceRTLs/common/src/parallel.cu
@@ -72,7 +72,7 @@
// We cannot have more than the # of convergent threads.
if (SimdLimitSource > 0)
- *NumLanes = min(ConvergentSize, SimdLimitSource);
+ *NumLanes = __kmpc_impl_min(ConvergentSize, SimdLimitSource);
else
*NumLanes = ConvergentSize;
ASSERT(LT_FUSSY, *NumLanes > 0, "bad thread request of %d threads",
@@ -149,7 +149,7 @@
// We cannot have more than the # of convergent threads.
uint16_t NumThreads;
if (NumThreadsSource > 0)
- NumThreads = min(ConvergentSize, NumThreadsSource);
+ NumThreads = __kmpc_impl_min(ConvergentSize, NumThreadsSource);
else
NumThreads = ConvergentSize;
ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads",
Index: openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
===================================================================
--- openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
+++ openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
@@ -109,6 +109,10 @@
INLINE uint64_t __kmpc_impl_popc(uint64_t x) { return __builtin_popcountl(x); }
+template <typename T> INLINE T __kmpc_impl_min(T x, T y) {
+ return x < y ? x : y;
+}
+
INLINE __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
return __ballot64(1);
}
Index: openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt
===================================================================
--- openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt
+++ openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt
@@ -59,6 +59,7 @@
${devicertl_base_directory}/common/src/critical.cu
${devicertl_base_directory}/common/src/loop.cu
${devicertl_base_directory}/common/src/omptarget.cu
+ ${devicertl_base_directory}/common/src/parallel.cu
${devicertl_base_directory}/common/src/sync.cu
${devicertl_base_directory}/common/src/task.cu)
-------------- next part --------------
A non-text attachment was scrubbed...
Name: D71580.234185.patch
Type: text/x-patch
Size: 3865 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/openmp-commits/attachments/20191217/f974a251/attachment.bin>
More information about the Openmp-commits
mailing list