[Openmp-commits] [PATCH] D66853: [libomptarget] Refactor shfl_down_sync macro to inline function
Jon Chesterfield via Phabricator via Openmp-commits
openmp-commits at lists.llvm.org
Tue Aug 27 18:08:14 PDT 2019
JonChesterfield created this revision.
JonChesterfield added reviewers: jdoerfert, ABataev, grokos, ronlieb, gregrodgers.
Herald added a project: OpenMP.
Herald added a subscriber: openmp-commits.
[libomptarget] Refactor shfl_down_sync macro to inline function
See also the abandoned D66846 <https://reviews.llvm.org/D66846>, which was split into this diff and others.
Repository:
rG LLVM Github Monorepo
https://reviews.llvm.org/D66853
Files:
openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu
openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
Index: openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
===================================================================
--- openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
+++ openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
@@ -38,6 +38,22 @@
INLINE int __kmpc_impl_popc(uint32_t x) { return __popc(x); }
+#ifndef CUDA_VERSION
+#error CUDA_VERSION macro is undefined, something wrong with cuda.
+#endif
+
+// In Cuda >= 9.0, the *_sync() versions take an extra argument 'mask'.
+
+INLINE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask,
+ int32_t Var, uint32_t Delta,
+ int32_t Width) {
+#if CUDA_VERSION >= 9000
+ return __shfl_down_sync(Mask, Var, Delta, Width);
+#else
+ return __shfl_down(Var, Delta, Width);
+#endif
+}
+
INLINE void __kmpc_impl_syncwarp(int32_t Mask) { __SYNCWARP(Mask); }
#endif
Index: openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu
===================================================================
--- openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu
+++ openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu
@@ -15,6 +15,7 @@
#include <stdio.h>
#include "omptarget-nvptx.h"
+#include "target_impl.h"
EXTERN
void __kmpc_nvptx_end_reduce(int32_t global_tid) {}
@@ -23,14 +24,14 @@
void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid) {}
EXTERN int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size) {
- return __SHFL_DOWN_SYNC(0xFFFFFFFF, val, delta, size);
+ return __kmpc_impl_shfl_down_sync(0xFFFFFFFF, val, delta, size);
}
EXTERN int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size) {
int lo, hi;
asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));
- hi = __SHFL_DOWN_SYNC(0xFFFFFFFF, hi, delta, size);
- lo = __SHFL_DOWN_SYNC(0xFFFFFFFF, lo, delta, size);
+ hi = __kmpc_impl_shfl_down_sync(0xFFFFFFFF, hi, delta, size);
+ lo = __kmpc_impl_shfl_down_sync(0xFFFFFFFF, lo, delta, size);
asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi));
return val;
}
Index: openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
===================================================================
--- openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
+++ openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
@@ -52,14 +52,10 @@
#error CUDA_VERSION macro is undefined, something wrong with cuda.
#elif CUDA_VERSION >= 9000
#define __SHFL_SYNC(mask, var, srcLane) __shfl_sync((mask), (var), (srcLane))
-#define __SHFL_DOWN_SYNC(mask, var, delta, width) \
- __shfl_down_sync((mask), (var), (delta), (width))
#define __ACTIVEMASK() __activemask()
#define __SYNCWARP(Mask) __syncwarp(Mask)
#else
#define __SHFL_SYNC(mask, var, srcLane) __shfl((var), (srcLane))
-#define __SHFL_DOWN_SYNC(mask, var, delta, width) \
- __shfl_down((var), (delta), (width))
#define __ACTIVEMASK() __ballot(1)
// In Cuda < 9.0 no need to sync threads in warps.
#define __SYNCWARP(Mask)
-------------- next part --------------
A non-text attachment was scrubbed...
Name: D66853.217552.patch
Type: text/x-patch
Size: 3117 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/openmp-commits/attachments/20190828/2fa8cb3a/attachment.bin>
More information about the Openmp-commits
mailing list