r272299 - [CUDA] Implement __shfl* intrinsics in clang headers.
Justin Lebar via cfe-commits
cfe-commits at lists.llvm.org
Thu Jun 9 13:04:57 PDT 2016
Author: jlebar
Date: Thu Jun 9 15:04:57 2016
New Revision: 272299
URL: http://llvm.org/viewvc/llvm-project?rev=272299&view=rev
Log:
[CUDA] Implement __shfl* intrinsics in clang headers.
Summary: Clang changes to make use of the LLVM intrinsics added in D21160.
Reviewers: tra
Subscribers: jholewinski, cfe-commits
Differential Revision: http://reviews.llvm.org/D21162
Modified:
cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def
cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h
cfe/trunk/lib/Headers/__clang_cuda_runtime_wrapper.h
Modified: cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def?rev=272299&r1=272298&r2=272299&view=diff
==============================================================================
--- cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def (original)
+++ cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def Thu Jun 9 15:04:57 2016
@@ -402,6 +402,17 @@ BUILTIN(__nvvm_bar0_popc, "ii", "")
BUILTIN(__nvvm_bar0_and, "ii", "")
BUILTIN(__nvvm_bar0_or, "ii", "")
+// Shuffle
+
+BUILTIN(__builtin_ptx_shfl_down_i32, "iiii", "")
+BUILTIN(__builtin_ptx_shfl_down_f32, "ffii", "")
+BUILTIN(__builtin_ptx_shfl_up_i32, "iiii", "")
+BUILTIN(__builtin_ptx_shfl_up_f32, "ffii", "")
+BUILTIN(__builtin_ptx_shfl_bfly_i32, "iiii", "")
+BUILTIN(__builtin_ptx_shfl_bfly_f32, "ffii", "")
+BUILTIN(__builtin_ptx_shfl_idx_i32, "iiii", "")
+BUILTIN(__builtin_ptx_shfl_idx_f32, "ffii", "")
+
// Membar
BUILTIN(__nvvm_membar_cta, "v", "")
Modified: cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h?rev=272299&r1=272298&r2=272299&view=diff
==============================================================================
--- cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h (original)
+++ cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h Thu Jun 9 15:04:57 2016
@@ -26,6 +26,76 @@
#error "This file is for CUDA compilation only."
#endif
+// sm_30 intrinsics: __shfl_{up,down,xor}.
+
+#define __SM_30_INTRINSICS_H__
+#define __SM_30_INTRINSICS_HPP__
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
+
+#pragma push_macro("__MAKE_SHUFFLES")
+#define __MAKE_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, __Mask) \
+ inline __device__ int __FnName(int __in, int __offset, \
+ int __width = warpSize) { \
+ return __IntIntrinsic(__in, __offset, \
+ ((warpSize - __width) << 8) | (__Mask)); \
+ } \
+ inline __device__ float __FnName(float __in, int __offset, \
+ int __width = warpSize) { \
+ return __FloatIntrinsic(__in, __offset, \
+ ((warpSize - __width) << 8) | (__Mask)); \
+ } \
+ inline __device__ unsigned int __FnName(unsigned int __in, int __offset, \
+ int __width = warpSize) { \
+ return static_cast<unsigned int>( \
+ ::__FnName(static_cast<int>(__in), __offset, __width)); \
+ } \
+ inline __device__ long long __FnName(long long __in, int __offset, \
+ int __width = warpSize) { \
+ struct __Bits { \
+ int __a, __b; \
+ }; \
+ _Static_assert(sizeof(__in) == sizeof(__Bits)); \
+ _Static_assert(sizeof(__Bits) == 2 * sizeof(int)); \
+ __Bits __tmp; \
+ memcpy(&__in, &__tmp, sizeof(__in)); \
+ __tmp.__a = ::__FnName(__tmp.__a, __offset, __width); \
+ __tmp.__b = ::__FnName(__tmp.__b, __offset, __width); \
+ long long __out; \
+ memcpy(&__out, &__tmp, sizeof(__tmp)); \
+ return __out; \
+ } \
+ inline __device__ unsigned long long __FnName( \
+ unsigned long long __in, int __offset, int __width = warpSize) { \
+ return static_cast<unsigned long long>( \
+ ::__FnName(static_cast<unsigned long long>(__in), __offset, __width)); \
+ } \
+ inline __device__ double __FnName(double __in, int __offset, \
+ int __width = warpSize) { \
+ long long __tmp; \
+ _Static_assert(sizeof(__tmp) == sizeof(__in)); \
+ memcpy(&__tmp, &__in, sizeof(__in)); \
+ __tmp = ::__FnName(__tmp, __offset, __width); \
+ double __out; \
+ memcpy(&__out, &__tmp, sizeof(__out)); \
+ return __out; \
+ }
+
+__MAKE_SHUFFLES(__shfl, __builtin_ptx_shfl_idx_i32, __builtin_ptx_shfl_idx_f32,
+ 0x1f);
+// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
+// maxLane.
+__MAKE_SHUFFLES(__shfl_up, __builtin_ptx_shfl_up_i32, __builtin_ptx_shfl_up_f32,
+ 0);
+__MAKE_SHUFFLES(__shfl_down, __builtin_ptx_shfl_down_i32,
+ __builtin_ptx_shfl_down_f32, 0x1f);
+__MAKE_SHUFFLES(__shfl_xor, __builtin_ptx_shfl_bfly_i32,
+ __builtin_ptx_shfl_bfly_f32, 0x1f);
+
+#pragma pop_macro("__MAKE_SHUFFLES")
+
+#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
+
// sm_32 intrinsics: __ldg and __funnelshift_{l,lc,r,rc}.
// Prevent the vanilla sm_32 intrinsics header from being included.
Modified: cfe/trunk/lib/Headers/__clang_cuda_runtime_wrapper.h
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Headers/__clang_cuda_runtime_wrapper.h?rev=272299&r1=272298&r2=272299&view=diff
==============================================================================
--- cfe/trunk/lib/Headers/__clang_cuda_runtime_wrapper.h (original)
+++ cfe/trunk/lib/Headers/__clang_cuda_runtime_wrapper.h Thu Jun 9 15:04:57 2016
@@ -198,13 +198,14 @@ static inline __device__ void __brkpt(in
#include "sm_20_atomic_functions.hpp"
#include "sm_20_intrinsics.hpp"
#include "sm_32_atomic_functions.hpp"
-// sm_30_intrinsics.h has declarations that use default argument, so
-// we have to include it and it will in turn include .hpp
-#include "sm_30_intrinsics.h"
-// Don't include sm_32_intrinsics.h. That header defines __ldg using inline
-// asm, but we want to define it using builtins, because we can't use the
-// [addr+imm] addressing mode if we use the inline asm in the header.
+// Don't include sm_30_intrinsics.h and sm_32_intrinsics.h. These define the
+// __shfl and __ldg intrinsics using inline (volatile) asm, but we want to
+// define them using builtins so that the optimizer can reason about and across
+// these instructions. In particular, using intrinsics for ldg gets us the
+// [addr+imm] addressing mode, which, although it doesn't actually exist in the
+// hardware, seems to generate faster machine code because ptxas can more easily
+// reason about our code.
#undef __MATH_FUNCTIONS_HPP__
More information about the cfe-commits
mailing list