[Openmp-commits] [openmp] r370144 - [libomptarget] Refactor shfl_sync macro to inline function
Jon Chesterfield via Openmp-commits
openmp-commits at lists.llvm.org
Tue Aug 27 18:31:05 PDT 2019
Author: jonchesterfield
Date: Tue Aug 27 18:31:04 2019
New Revision: 370144
URL: http://llvm.org/viewvc/llvm-project?rev=370144&view=rev
Log:
[libomptarget] Refactor shfl_sync macro to inline function
Summary:
[libomptarget] Refactor shfl_sync macro to inline function
See also abandoned D66846, split into this diff and others.
Reviewers: jdoerfert, ABataev, grokos, ronlieb, gregrodgers
Subscribers: openmp-commits
Tags: #openmp
Differential Revision: https://reviews.llvm.org/D66852
Modified:
openmp/trunk/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu
openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu
openmp/trunk/libomptarget/deviceRTLs/nvptx/src/target_impl.h
Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu?rev=370144&r1=370143&r2=370144&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu Tue Aug 27 18:31:04 2019
@@ -10,6 +10,7 @@
//
//===----------------------------------------------------------------------===//
#include "omptarget-nvptx.h"
+#include "target_impl.h"
#include <stdio.h>
// Warp ID in the CUDA block
@@ -430,9 +431,10 @@ INLINE static void* data_sharing_push_st
}
}
// Get address from lane 0.
- ((int *)&FrameP)[0] = __SHFL_SYNC(CurActive, ((int *)&FrameP)[0], 0);
+ int *FP = (int *)&FrameP;
+ FP[0] = __kmpc_impl_shfl_sync(CurActive, FP[0], 0);
if (sizeof(FrameP) == 8)
- ((int *)&FrameP)[1] = __SHFL_SYNC(CurActive, ((int *)&FrameP)[1], 0);
+ FP[1] = __kmpc_impl_shfl_sync(CurActive, FP[1], 0);
return FrameP;
}
Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu?rev=370144&r1=370143&r2=370144&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu Tue Aug 27 18:31:04 2019
@@ -383,8 +383,8 @@ public:
INLINE static int64_t Shuffle(unsigned active, int64_t val, int leader) {
int lo, hi;
__kmpc_impl_unpack(val, lo, hi);
- hi = __SHFL_SYNC(active, hi, leader);
- lo = __SHFL_SYNC(active, lo, leader);
+ hi = __kmpc_impl_shfl_sync(active, hi, leader);
+ lo = __kmpc_impl_shfl_sync(active, lo, leader);
return __kmpc_impl_pack(lo, hi);
}
Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h?rev=370144&r1=370143&r2=370144&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h Tue Aug 27 18:31:04 2019
@@ -51,13 +51,11 @@
#ifndef CUDA_VERSION
#error CUDA_VERSION macro is undefined, something wrong with cuda.
#elif CUDA_VERSION >= 9000
-#define __SHFL_SYNC(mask, var, srcLane) __shfl_sync((mask), (var), (srcLane))
#define __SHFL_DOWN_SYNC(mask, var, delta, width) \
__shfl_down_sync((mask), (var), (delta), (width))
#define __ACTIVEMASK() __activemask()
#define __SYNCWARP(Mask) __syncwarp(Mask)
#else
-#define __SHFL_SYNC(mask, var, srcLane) __shfl((var), (srcLane))
#define __SHFL_DOWN_SYNC(mask, var, delta, width) \
__shfl_down((var), (delta), (width))
#define __ACTIVEMASK() __ballot(1)
Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu?rev=370144&r1=370143&r2=370144&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu Tue Aug 27 18:31:04 2019
@@ -33,6 +33,7 @@
//===----------------------------------------------------------------------===//
#include "omptarget-nvptx.h"
+#include "target_impl.h"
typedef struct ConvergentSimdJob {
omptarget_nvptx_TaskDescr taskDescr;
@@ -64,7 +65,7 @@ EXTERN bool __kmpc_kernel_convergent_sim
omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId);
job->slimForNextSimd = SimdLimit;
- int32_t SimdLimitSource = __SHFL_SYNC(Mask, SimdLimit, *LaneSource);
+ int32_t SimdLimitSource = __kmpc_impl_shfl_sync(Mask, SimdLimit, *LaneSource);
// reset simdlimit to avoid propagating to successive #simd
if (SimdLimitSource > 0 && threadId == sourceThreadId)
omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId) = 0;
@@ -138,7 +139,8 @@ EXTERN bool __kmpc_kernel_convergent_par
omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId);
job->tnumForNextPar = NumThreadsClause;
- int32_t NumThreadsSource = __SHFL_SYNC(Mask, NumThreadsClause, *LaneSource);
+ int32_t NumThreadsSource =
+ __kmpc_impl_shfl_sync(Mask, NumThreadsClause, *LaneSource);
// reset numthreads to avoid propagating to successive #parallel
if (NumThreadsSource > 0 && threadId == sourceThreadId)
omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) =
Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/target_impl.h
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/target_impl.h?rev=370144&r1=370143&r2=370144&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/target_impl.h (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/target_impl.h Tue Aug 27 18:31:04 2019
@@ -38,6 +38,20 @@ INLINE int __kmpc_impl_ffs(uint32_t x) {
INLINE int __kmpc_impl_popc(uint32_t x) { return __popc(x); }
+#ifndef CUDA_VERSION
+#error CUDA_VERSION macro is undefined, something wrong with cuda.
+#endif
+
+// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'.
+INLINE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
+ int32_t SrcLane) {
+#if CUDA_VERSION >= 9000
+ return __shfl_sync(Mask, Var, SrcLane);
+#else
+ return __shfl(Var, SrcLane);
+#endif // CUDA_VERSION
+}
+
INLINE void __kmpc_impl_syncwarp(int32_t Mask) { __SYNCWARP(Mask); }
#endif
More information about the Openmp-commits mailing list