r313820 - [NVPTX] Implemented shfl.sync instruction and supporting intrinsics/builtins.
Artem Belevich via cfe-commits
cfe-commits at lists.llvm.org
Wed Sep 20 14:23:07 PDT 2017
Author: tra
Date: Wed Sep 20 14:23:07 2017
New Revision: 313820
URL: http://llvm.org/viewvc/llvm-project?rev=313820&view=rev
Log:
[NVPTX] Implemented shfl.sync instruction and supporting intrinsics/builtins.
Differential Revision: https://reviews.llvm.org/D38090
Added:
cfe/trunk/test/CodeGen/builtins-nvptx-ptx60.cu
Modified:
cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def
cfe/trunk/lib/Driver/ToolChains/Cuda.cpp
cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h
cfe/trunk/test/CodeGen/builtins-nvptx.c
Modified: cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def?rev=313820&r1=313819&r2=313820&view=diff
==============================================================================
--- cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def (original)
+++ cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def Wed Sep 20 14:23:07 2017
@@ -390,6 +390,15 @@ BUILTIN(__nvvm_shfl_bfly_f32, "ffii", ""
BUILTIN(__nvvm_shfl_idx_i32, "iiii", "")
BUILTIN(__nvvm_shfl_idx_f32, "ffii", "")
+TARGET_BUILTIN(__nvvm_shfl_sync_down_i32, "iUiiii", "", "ptx60")
+TARGET_BUILTIN(__nvvm_shfl_sync_down_f32, "fUifii", "", "ptx60")
+TARGET_BUILTIN(__nvvm_shfl_sync_up_i32, "iUiiii", "", "ptx60")
+TARGET_BUILTIN(__nvvm_shfl_sync_up_f32, "fUifii", "", "ptx60")
+TARGET_BUILTIN(__nvvm_shfl_sync_bfly_i32, "iUiiii", "", "ptx60")
+TARGET_BUILTIN(__nvvm_shfl_sync_bfly_f32, "fUifii", "", "ptx60")
+TARGET_BUILTIN(__nvvm_shfl_sync_idx_i32, "iUiiii", "", "ptx60")
+TARGET_BUILTIN(__nvvm_shfl_sync_idx_f32, "fUifii", "", "ptx60")
+
// Membar
BUILTIN(__nvvm_membar_cta, "v", "")
Modified: cfe/trunk/lib/Driver/ToolChains/Cuda.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Driver/ToolChains/Cuda.cpp?rev=313820&r1=313819&r2=313820&view=diff
==============================================================================
--- cfe/trunk/lib/Driver/ToolChains/Cuda.cpp (original)
+++ cfe/trunk/lib/Driver/ToolChains/Cuda.cpp Wed Sep 20 14:23:07 2017
@@ -507,11 +507,17 @@ void CudaToolChain::addClangTargetOption
CC1Args.push_back("-mlink-cuda-bitcode");
CC1Args.push_back(DriverArgs.MakeArgString(LibDeviceFile));
- // Libdevice in CUDA-7.0 requires PTX version that's more recent
- // than LLVM defaults to. Use PTX4.2 which is the PTX version that
- // came with CUDA-7.0.
- CC1Args.push_back("-target-feature");
- CC1Args.push_back("+ptx42");
+ if (CudaInstallation.version() >= CudaVersion::CUDA_90) {
+ // CUDA-9 uses new instructions that are only available in PTX6.0
+ CC1Args.push_back("-target-feature");
+ CC1Args.push_back("+ptx60");
+ } else {
+ // Libdevice in CUDA-7.0 requires PTX version that's more recent
+ // than LLVM defaults to. Use PTX4.2 which is the PTX version that
+ // came with CUDA-7.0.
+ CC1Args.push_back("-target-feature");
+ CC1Args.push_back("+ptx42");
+ }
}
void CudaToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs,
Modified: cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h?rev=313820&r1=313819&r2=313820&view=diff
==============================================================================
--- cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h (original)
+++ cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h Wed Sep 20 14:23:07 2017
@@ -92,6 +92,74 @@ __MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_
#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
+// __shfl_sync_* variants available in CUDA-9
+#if CUDA_VERSION >= 9000 && (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300)
+#pragma push_macro("__MAKE_SYNC_SHUFFLES")
+#define __MAKE_SYNC_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, \
+ __Mask) \
+ inline __device__ int __FnName(unsigned int __mask, int __val, int __offset, \
+ int __width = warpSize) { \
+ return __IntIntrinsic(__mask, __val, __offset, \
+ ((warpSize - __width) << 8) | (__Mask)); \
+ } \
+ inline __device__ float __FnName(unsigned int __mask, float __val, \
+ int __offset, int __width = warpSize) { \
+ return __FloatIntrinsic(__mask, __val, __offset, \
+ ((warpSize - __width) << 8) | (__Mask)); \
+ } \
+ inline __device__ unsigned int __FnName(unsigned int __mask, \
+ unsigned int __val, int __offset, \
+ int __width = warpSize) { \
+ return static_cast<unsigned int>( \
+ ::__FnName(__mask, static_cast<int>(__val), __offset, __width)); \
+ } \
+ inline __device__ long long __FnName(unsigned int __mask, long long __val, \
+ int __offset, int __width = warpSize) { \
+ struct __Bits { \
+ int __a, __b; \
+ }; \
+ _Static_assert(sizeof(__val) == sizeof(__Bits)); \
+ _Static_assert(sizeof(__Bits) == 2 * sizeof(int)); \
+ __Bits __tmp; \
+ memcpy(&__val, &__tmp, sizeof(__val)); \
+ __tmp.__a = ::__FnName(__mask, __tmp.__a, __offset, __width); \
+ __tmp.__b = ::__FnName(__mask, __tmp.__b, __offset, __width); \
+ long long __ret; \
+ memcpy(&__ret, &__tmp, sizeof(__tmp)); \
+ return __ret; \
+ } \
+ inline __device__ unsigned long long __FnName( \
+ unsigned int __mask, unsigned long long __val, int __offset, \
+ int __width = warpSize) { \
+ return static_cast<unsigned long long>(::__FnName( \
+ __mask, static_cast<unsigned long long>(__val), __offset, __width)); \
+ } \
+ inline __device__ double __FnName(unsigned int __mask, double __val, \
+ int __offset, int __width = warpSize) { \
+ long long __tmp; \
+ _Static_assert(sizeof(__tmp) == sizeof(__val)); \
+ memcpy(&__tmp, &__val, sizeof(__val)); \
+ __tmp = ::__FnName(__mask, __tmp, __offset, __width); \
+ double __ret; \
+ memcpy(&__ret, &__tmp, sizeof(__ret)); \
+ return __ret; \
+ }
+__MAKE_SYNC_SHUFFLES(__shfl_sync, __nvvm_shfl_sync_idx_i32,
+ __nvvm_shfl_sync_idx_f32, 0x1f);
+// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
+// maxLane.
+__MAKE_SYNC_SHUFFLES(__shfl_sync_up, __nvvm_shfl_sync_up_i32,
+ __nvvm_shfl_sync_up_f32, 0);
+__MAKE_SYNC_SHUFFLES(__shfl_sync_down, __nvvm_shfl_sync_down_i32,
+ __nvvm_shfl_sync_down_f32, 0x1f);
+__MAKE_SYNC_SHUFFLES(__shfl_sync_xor, __nvvm_shfl_sync_bfly_i32,
+ __nvvm_shfl_sync_bfly_f32, 0x1f);
+
+#pragma pop_macro("__MAKE_SYNC_SHUFFLES")
+
+#endif // __CUDA_VERSION >= 9000 && (!defined(__CUDA_ARCH__) ||
+ // __CUDA_ARCH__ >= 300)
+
// sm_32 intrinsics: __ldg and __funnelshift_{l,lc,r,rc}.
// Prevent the vanilla sm_32 intrinsics header from being included.
Added: cfe/trunk/test/CodeGen/builtins-nvptx-ptx60.cu
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/CodeGen/builtins-nvptx-ptx60.cu?rev=313820&view=auto
==============================================================================
--- cfe/trunk/test/CodeGen/builtins-nvptx-ptx60.cu (added)
+++ cfe/trunk/test/CodeGen/builtins-nvptx-ptx60.cu Wed Sep 20 14:23:07 2017
@@ -0,0 +1,40 @@
+// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -target-cpu sm_60 \
+// RUN: -fcuda-is-device -target-feature +ptx60 \
+// RUN: -S -emit-llvm -o - -x cuda %s \
+// RUN: | FileCheck -check-prefix=CHECK %s
+// RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_60 \
+// RUN: -fcuda-is-device -S -o /dev/null -x cuda -verify %s
+
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+#define __shared__ __attribute__((shared))
+#define __constant__ __attribute__((constant))
+
+// CHECK-LABEL: nvvm_shfl_sync
+__device__ void nvvm_shfl_sync(unsigned mask, int i, float f, int a, int b) {
+ // CHECK: call i32 @llvm.nvvm.shfl.sync.down.i32(i32 {{%[0-9]+}}, i32
+ // expected-error at +1 {{'__nvvm_shfl_sync_down_i32' needs target feature ptx60}}
+ __nvvm_shfl_sync_down_i32(mask, i, a, b);
+ // CHECK: call float @llvm.nvvm.shfl.sync.down.f32(i32 {{%[0-9]+}}, float
+ // expected-error at +1 {{'__nvvm_shfl_sync_down_f32' needs target feature ptx60}}
+ __nvvm_shfl_sync_down_f32(mask, f, a, b);
+ // CHECK: call i32 @llvm.nvvm.shfl.sync.up.i32(i32 {{%[0-9]+}}, i32
+ // expected-error at +1 {{'__nvvm_shfl_sync_up_i32' needs target feature ptx60}}
+ __nvvm_shfl_sync_up_i32(mask, i, a, b);
+ // CHECK: call float @llvm.nvvm.shfl.sync.up.f32(i32 {{%[0-9]+}}, float
+ // expected-error at +1 {{'__nvvm_shfl_sync_up_f32' needs target feature ptx60}}
+ __nvvm_shfl_sync_up_f32(mask, f, a, b);
+ // CHECK: call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 {{%[0-9]+}}, i32
+ // expected-error at +1 {{'__nvvm_shfl_sync_bfly_i32' needs target feature ptx60}}
+ __nvvm_shfl_sync_bfly_i32(mask, i, a, b);
+ // CHECK: call float @llvm.nvvm.shfl.sync.bfly.f32(i32 {{%[0-9]+}}, float
+ // expected-error at +1 {{'__nvvm_shfl_sync_bfly_f32' needs target feature ptx60}}
+ __nvvm_shfl_sync_bfly_f32(mask, f, a, b);
+ // CHECK: call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 {{%[0-9]+}}, i32
+ // expected-error at +1 {{'__nvvm_shfl_sync_idx_i32' needs target feature ptx60}}
+ __nvvm_shfl_sync_idx_i32(mask, i, a, b);
+ // CHECK: call float @llvm.nvvm.shfl.sync.idx.f32(i32 {{%[0-9]+}}, float
+ // expected-error at +1 {{'__nvvm_shfl_sync_idx_f32' needs target feature ptx60}}
+ __nvvm_shfl_sync_idx_f32(mask, f, a, b);
+ // CHECK: ret void
+}
Modified: cfe/trunk/test/CodeGen/builtins-nvptx.c
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/CodeGen/builtins-nvptx.c?rev=313820&r1=313819&r2=313820&view=diff
==============================================================================
--- cfe/trunk/test/CodeGen/builtins-nvptx.c (original)
+++ cfe/trunk/test/CodeGen/builtins-nvptx.c Wed Sep 20 14:23:07 2017
@@ -636,3 +636,24 @@ __device__ void nvvm_ldg(const void *p)
typedef double double2 __attribute__((ext_vector_type(2)));
__nvvm_ldg_d2((const double2 *)p);
}
+
+// CHECK-LABEL: nvvm_shfl
+__device__ void nvvm_shfl(int i, float f, int a, int b) {
+ // CHECK: call i32 @llvm.nvvm.shfl.down.i32(i32
+ __nvvm_shfl_down_i32(i, a, b);
+ // CHECK: call float @llvm.nvvm.shfl.down.f32(float
+ __nvvm_shfl_down_f32(f, a, b);
+ // CHECK: call i32 @llvm.nvvm.shfl.up.i32(i32
+ __nvvm_shfl_up_i32(i, a, b);
+ // CHECK: call float @llvm.nvvm.shfl.up.f32(float
+ __nvvm_shfl_up_f32(f, a, b);
+ // CHECK: call i32 @llvm.nvvm.shfl.bfly.i32(i32
+ __nvvm_shfl_bfly_i32(i, a, b);
+ // CHECK: call float @llvm.nvvm.shfl.bfly.f32(float
+ __nvvm_shfl_bfly_f32(f, a, b);
+ // CHECK: call i32 @llvm.nvvm.shfl.idx.i32(i32
+ __nvvm_shfl_idx_i32(i, a, b);
+ // CHECK: call float @llvm.nvvm.shfl.idx.f32(float
+ __nvvm_shfl_idx_f32(f, a, b);
+ // CHECK: ret void
+}
More information about the cfe-commits
mailing list