[clang] 48d6f52 - [CUDA][FIX] Make shfl[_sync] for unsigned long long non-recursive

Johannes Doerfert via cfe-commits cfe-commits at lists.llvm.org
Thu Jul 21 10:41:16 PDT 2022


Author: Johannes Doerfert
Date: 2022-07-21T12:36:54-05:00
New Revision: 48d6f5240187573881f96cc9574ea09592f50723

URL: https://github.com/llvm/llvm-project/commit/48d6f5240187573881f96cc9574ea09592f50723
DIFF: https://github.com/llvm/llvm-project/commit/48d6f5240187573881f96cc9574ea09592f50723.diff

LOG: [CUDA][FIX] Make shfl[_sync] for unsigned long long non-recursive

A copy-paste error caused UB in the definition of the unsigned long long
versions of the shfl intrinsics. Reported and diagnosed by @trws.

Differential Revision: https://reviews.llvm.org/D129536

Added: 
    clang/test/CodeGenCUDA/shuffle_long_long.cu

Modified: 
    clang/lib/Headers/__clang_cuda_intrinsics.h

Removed: 
    


################################################################################
diff  --git a/clang/lib/Headers/__clang_cuda_intrinsics.h b/clang/lib/Headers/__clang_cuda_intrinsics.h
index cfd5eb869e340..b87413e12a272 100644
--- a/clang/lib/Headers/__clang_cuda_intrinsics.h
+++ b/clang/lib/Headers/__clang_cuda_intrinsics.h
@@ -71,8 +71,8 @@
   }                                                                            \
   inline __device__ unsigned long long __FnName(                               \
       unsigned long long __val, __Type __offset, int __width = warpSize) {     \
-    return static_cast<unsigned long long>(::__FnName(                         \
-        static_cast<unsigned long long>(__val), __offset, __width));           \
+    return static_cast<unsigned long long>(                                    \
+        ::__FnName(static_cast<long long>(__val), __offset, __width));         \
   }                                                                            \
   inline __device__ double __FnName(double __val, __Type __offset,             \
                                     int __width = warpSize) {                  \
@@ -139,8 +139,8 @@ __MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f,
   inline __device__ unsigned long long __FnName(                               \
       unsigned int __mask, unsigned long long __val, __Type __offset,          \
       int __width = warpSize) {                                                \
-    return static_cast<unsigned long long>(::__FnName(                         \
-        __mask, static_cast<unsigned long long>(__val), __offset, __width));   \
+    return static_cast<unsigned long long>(                                    \
+        ::__FnName(__mask, static_cast<long long>(__val), __offset, __width)); \
   }                                                                            \
   inline __device__ long __FnName(unsigned int __mask, long __val,             \
                                   __Type __offset, int __width = warpSize) {   \

diff  --git a/clang/test/CodeGenCUDA/shuffle_long_long.cu b/clang/test/CodeGenCUDA/shuffle_long_long.cu
new file mode 100644
index 0000000000000..c38b082832322
--- /dev/null
+++ b/clang/test/CodeGenCUDA/shuffle_long_long.cu
@@ -0,0 +1,61 @@
+// REQUIRES: nvptx-registered-target
+
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fcuda-is-device -emit-llvm -target-cpu sm_30 %s -o - | FileCheck %s --check-prefix=NO_SYNC
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fcuda-is-device -emit-llvm -target-cpu sm_30 -target-feature +ptx70 -DSYNC -DCUDA_VERSION=9000 %s -o - | FileCheck %s --check-prefix=SYNC
+
+#include "Inputs/cuda.h"
+
+__device__ void *memcpy(void *dest, const void *src, size_t n);
+
+#define warpSize 32
+#include <__clang_cuda_intrinsics.h>
+
+__device__ void use(unsigned long long, long long);
+
+// Test function, 4 shfl calls.
+// NO_SYNC: define{{.*}} @_Z14test_long_longv
+// NO_SYNC:     call noundef i64 @_Z6__shflyii(
+// NO_SYNC:     call noundef i64 @_Z6__shflxii(
+
+// SYNC: define{{.*}} @_Z14test_long_longv
+// SYNC:        call noundef i64 @_Z11__shfl_syncjyii(
+// SYNC:        call noundef i64 @_Z11__shfl_syncjxii(
+
+// unsigned long long -> long long
+// NO_SYNC: define{{.*}} @_Z6__shflyii
+// NO_SYNC:     call noundef i64 @_Z6__shflxii(
+
+// long long -> int + int
+// NO_SYNC: define{{.*}} @_Z6__shflxii
+// NO_SYNC:     call noundef i32 @_Z6__shfliii(
+// NO_SYNC:     call noundef i32 @_Z6__shfliii(
+
+// NO_SYNC: define{{.*}} @_Z6__shfliii
+// NO_SYNC:   call i32 @llvm.nvvm.shfl.idx.i32
+
+// unsigned long long -> long long
+// SYNC: _Z11__shfl_syncjyii
+// SYNC:     call noundef i64 @_Z11__shfl_syncjxii(
+
+// long long -> int + int
+// SYNC: define{{.*}} @_Z11__shfl_syncjxii
+// SYNC:     call noundef i32 @_Z11__shfl_syncjiii(
+// SYNC:     call noundef i32 @_Z11__shfl_syncjiii(
+
+// SYNC: define{{.*}} @_Z11__shfl_syncjiii
+// SYNC:      call i32 @llvm.nvvm.shfl.sync.idx.i32
+
+__device__ void test_long_long() {
+  unsigned long long ull = 13;
+  long long ll = 17;
+#ifndef SYNC
+  ull = __shfl(ull, 7, 32);
+  ll = __shfl(ll, 7, 32);
+  use(ull, ll);
+#else
+  ull = __shfl_sync(0x11, ull, 7, 32);
+  ll = __shfl_sync(0x11, ll, 7, 32);
+  use(ull, ll);
+#endif
+}
+


        


More information about the cfe-commits mailing list