[llvm-branch-commits] [clang] [libc] release/20.x: [Clang] Fix test after new argument was added (PR #125912)

Wed Feb 5 11:10:48 PST 2025

llvmbot wrote:




@llvm/pr-subscribers-backend-x86

Author: None (llvmbot)

<details>
<summary>Changes</summary>

Backport 2d8106cb5a505326d1da0f4461708ed44a0ac761 718cdeb9c701725412a040b2b7148523a286a256

Requested by: @jhuber6

---
Full diff: https://github.com/llvm/llvm-project/pull/125912.diff


7 Files Affected:

- (modified) clang/lib/Headers/amdgpuintrin.h (+9-5) 
- (modified) clang/lib/Headers/gpuintrin.h (+14-10) 
- (modified) clang/lib/Headers/nvptxintrin.h (+8-7) 
- (modified) clang/test/Headers/gpuintrin.c (+65-3) 
- (modified) libc/src/__support/GPU/utils.h (+3-2) 
- (modified) libc/test/integration/src/__support/GPU/CMakeLists.txt (+9) 
- (added) libc/test/integration/src/__support/GPU/shuffle.cpp (+33) 


``````````diff

diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
index 038605605462f8..9dad99ffe9439a 100644
--- a/clang/lib/Headers/amdgpuintrin.h
+++ b/clang/lib/Headers/amdgpuintrin.h
@@ -145,17 +145,21 @@ _DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) {
 
 // Shuffles the the lanes inside the wavefront according to the given index.
 _DEFAULT_FN_ATTRS static __inline__ uint32_t
-__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) {
-  return __builtin_amdgcn_ds_bpermute(__idx << 2, __x);
+__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
+                      uint32_t __width) {
+  uint32_t __lane = __idx + (__gpu_lane_id() & ~(__width - 1));
+  return __builtin_amdgcn_ds_bpermute(__lane << 2, __x);
 }
 
 // Shuffles the the lanes inside the wavefront according to the given index.
 _DEFAULT_FN_ATTRS static __inline__ uint64_t
-__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x) {
+__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x,
+                      uint32_t __width) {
   uint32_t __hi = (uint32_t)(__x >> 32ull);
   uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
-  return ((uint64_t)__builtin_amdgcn_ds_bpermute(__idx << 2, __hi) << 32ull) |
-         ((uint64_t)__builtin_amdgcn_ds_bpermute(__idx << 2, __lo));
+  return ((uint64_t)__gpu_shuffle_idx_u32(__lane_mask, __idx, __hi, __width)
+          << 32ull) |
+         ((uint64_t)__gpu_shuffle_idx_u32(__lane_mask, __idx, __lo, __width));
 }
 
 // Returns true if the flat pointer points to AMDGPU 'shared' memory.
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index 4c463c333308fc..11c87e85cd4975 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -133,18 +133,21 @@ __gpu_read_first_lane_f64(uint64_t __lane_mask, double __x) {
 
 // Shuffles the the lanes according to the given index.
 _DEFAULT_FN_ATTRS static __inline__ float
-__gpu_shuffle_idx_f32(uint64_t __lane_mask, uint32_t __idx, float __x) {
+__gpu_shuffle_idx_f32(uint64_t __lane_mask, uint32_t __idx, float __x,
+                      uint32_t __width) {
   return __builtin_bit_cast(
       float, __gpu_shuffle_idx_u32(__lane_mask, __idx,
-                                   __builtin_bit_cast(uint32_t, __x)));
+                                   __builtin_bit_cast(uint32_t, __x), __width));
 }
 
 // Shuffles the the lanes according to the given index.
 _DEFAULT_FN_ATTRS static __inline__ double
-__gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x) {
+__gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x,
+                      uint32_t __width) {
   return __builtin_bit_cast(
-      double, __gpu_shuffle_idx_u64(__lane_mask, __idx,
-                                    __builtin_bit_cast(uint64_t, __x)));
+      double,
+      __gpu_shuffle_idx_u64(__lane_mask, __idx,
+                            __builtin_bit_cast(uint64_t, __x), __width));
 }
 
 // Gets the sum of all lanes inside the warp or wavefront.
@@ -153,7 +156,8 @@ __gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x) {
       uint64_t __lane_mask, __type __x) {                                      \
     for (uint32_t __step = __gpu_num_lanes() / 2; __step > 0; __step /= 2) {   \
       uint32_t __index = __step + __gpu_lane_id();                             \
-      __x += __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x);          \
+      __x += __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x,           \
+                                          __gpu_num_lanes());                  \
     }                                                                          \
     return __gpu_read_first_lane_##__suffix(__lane_mask, __x);                 \
   }
@@ -171,10 +175,10 @@ __DO_LANE_SUM(double, f64);   // double __gpu_lane_sum_f64(m, x)
       uint32_t __index = __gpu_lane_id() - __step;                             \
       __bitmask_type bitmask = __gpu_lane_id() >= __step;                      \
       __x += __builtin_bit_cast(                                               \
-          __type,                                                              \
-          -bitmask & __builtin_bit_cast(__bitmask_type,                        \
-                                        __gpu_shuffle_idx_##__suffix(          \
-                                            __lane_mask, __index, __x)));      \
+          __type, -bitmask & __builtin_bit_cast(__bitmask_type,                \
+                                                __gpu_shuffle_idx_##__suffix(  \
+                                                    __lane_mask, __index, __x, \
+                                                    __gpu_num_lanes())));      \
     }                                                                          \
     return __x;                                                                \
   }
diff --git a/clang/lib/Headers/nvptxintrin.h b/clang/lib/Headers/nvptxintrin.h
index fb2864eab6a09d..40fa2edebe975c 100644
--- a/clang/lib/Headers/nvptxintrin.h
+++ b/clang/lib/Headers/nvptxintrin.h
@@ -149,22 +149,23 @@ _DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) {
 
 // Shuffles the the lanes inside the warp according to the given index.
 _DEFAULT_FN_ATTRS static __inline__ uint32_t
-__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) {
+__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
+                      uint32_t __width) {
   uint32_t __mask = (uint32_t)__lane_mask;
-  return __nvvm_shfl_sync_idx_i32(__mask, __x, __idx, __gpu_num_lanes() - 1u);
+  return __nvvm_shfl_sync_idx_i32(__mask, __x, __idx,
+                                  ((__gpu_num_lanes() - __width) << 8u) | 0x1f);
 }
 
 // Shuffles the the lanes inside the warp according to the given index.
 _DEFAULT_FN_ATTRS static __inline__ uint64_t
-__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x) {
+__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x,
+                      uint32_t __width) {
   uint32_t __hi = (uint32_t)(__x >> 32ull);
   uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
   uint32_t __mask = (uint32_t)__lane_mask;
-  return ((uint64_t)__nvvm_shfl_sync_idx_i32(__mask, __hi, __idx,
-                                             __gpu_num_lanes() - 1u)
+  return ((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __hi, __width)
           << 32ull) |
-         ((uint64_t)__nvvm_shfl_sync_idx_i32(__mask, __lo, __idx,
-                                             __gpu_num_lanes() - 1u));
+         ((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __lo, __width));
 }
 
 // Returns true if the flat pointer points to CUDA 'shared' memory.
diff --git a/clang/test/Headers/gpuintrin.c b/clang/test/Headers/gpuintrin.c
index 281339716c3edf..89efe12ee8def8 100644
--- a/clang/test/Headers/gpuintrin.c
+++ b/clang/test/Headers/gpuintrin.c
@@ -38,7 +38,7 @@
 // AMDGPU-NEXT:    [[CALL20:%.*]] = call i64 @__gpu_ballot(i64 noundef -1, i1 noundef zeroext true) #[[ATTR7]]
 // AMDGPU-NEXT:    call void @__gpu_sync_threads() #[[ATTR7]]
 // AMDGPU-NEXT:    call void @__gpu_sync_lane(i64 noundef -1) #[[ATTR7]]
-// AMDGPU-NEXT:    [[CALL21:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1) #[[ATTR7]]
+// AMDGPU-NEXT:    [[CALL21:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1, i32 noundef 0) #[[ATTR7]]
 // AMDGPU-NEXT:    [[CALL22:%.*]] = call i64 @__gpu_first_lane_id(i64 noundef -1) #[[ATTR7]]
 // AMDGPU-NEXT:    [[CALL23:%.*]] = call zeroext i1 @__gpu_is_first_in_lane(i64 noundef -1) #[[ATTR7]]
 // AMDGPU-NEXT:    call void @__gpu_exit() #[[ATTR8:[0-9]+]]
@@ -70,7 +70,7 @@
 // NVPTX-NEXT:    [[CALL20:%.*]] = call i64 @__gpu_ballot(i64 noundef -1, i1 noundef zeroext true) #[[ATTR6]]
 // NVPTX-NEXT:    call void @__gpu_sync_threads() #[[ATTR6]]
 // NVPTX-NEXT:    call void @__gpu_sync_lane(i64 noundef -1) #[[ATTR6]]
-// NVPTX-NEXT:    [[CALL21:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1) #[[ATTR6]]
+// NVPTX-NEXT:    [[CALL21:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1, i32 noundef 0) #[[ATTR6]]
 // NVPTX-NEXT:    [[CALL22:%.*]] = call i64 @__gpu_first_lane_id(i64 noundef -1) #[[ATTR6]]
 // NVPTX-NEXT:    [[CALL23:%.*]] = call zeroext i1 @__gpu_is_first_in_lane(i64 noundef -1) #[[ATTR6]]
 // NVPTX-NEXT:    call void @__gpu_exit() #[[ATTR7:[0-9]+]]
@@ -90,6 +90,68 @@ __gpu_kernel void foo() {
   __gpu_num_threads_z();
   __gpu_num_threads(0);
   __gpu_thread_id_x();
+// AMDGPU-LABEL: define internal i32 @__gpu_thread_id(
+// AMDGPU-SAME: i32 noundef [[__DIM:%.*]]) #[[ATTR0]] {
+// AMDGPU-NEXT:  [[ENTRY:.*:]]
+// AMDGPU-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT:    [[__DIM_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGPU-NEXT:    [[__DIM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__DIM_ADDR]] to ptr
+// AMDGPU-NEXT:    store i32 [[__DIM]], ptr [[__DIM_ADDR_ASCAST]], align 4
+// AMDGPU-NEXT:    [[TMP0:%.*]] = load i32, ptr [[__DIM_ADDR_ASCAST]], align 4
+// AMDGPU-NEXT:    switch i32 [[TMP0]], label %[[SW_DEFAULT:.*]] [
+// AMDGPU-NEXT:      i32 0, label %[[SW_BB:.*]]
+// AMDGPU-NEXT:      i32 1, label %[[SW_BB1:.*]]
+// AMDGPU-NEXT:      i32 2, label %[[SW_BB3:.*]]
+// AMDGPU-NEXT:    ]
+// AMDGPU:       [[SW_BB]]:
+// AMDGPU-NEXT:    [[CALL:%.*]] = call i32 @__gpu_thread_id_x() #[[ATTR7]]
+// AMDGPU-NEXT:    store i32 [[CALL]], ptr [[RETVAL_ASCAST]], align 4
+// AMDGPU-NEXT:    br label %[[RETURN:.*]]
+// AMDGPU:       [[SW_BB1]]:
+// AMDGPU-NEXT:    [[CALL2:%.*]] = call i32 @__gpu_thread_id_y() #[[ATTR7]]
+// AMDGPU-NEXT:    store i32 [[CALL2]], ptr [[RETVAL_ASCAST]], align 4
+// AMDGPU-NEXT:    br label %[[RETURN]]
+// AMDGPU:       [[SW_BB3]]:
+// AMDGPU-NEXT:    [[CALL4:%.*]] = call i32 @__gpu_thread_id_z() #[[ATTR7]]
+// AMDGPU-NEXT:    store i32 [[CALL4]], ptr [[RETVAL_ASCAST]], align 4
+// AMDGPU-NEXT:    br label %[[RETURN]]
+// AMDGPU:       [[SW_DEFAULT]]:
+// AMDGPU-NEXT:    unreachable
+// AMDGPU:       [[RETURN]]:
+// AMDGPU-NEXT:    [[TMP1:%.*]] = load i32, ptr [[RETVAL_ASCAST]], align 4
+// AMDGPU-NEXT:    ret i32 [[TMP1]]
+//
+// NVPTX-LABEL: define internal i32 @__gpu_thread_id(
+// NVPTX-SAME: i32 noundef [[__DIM:%.*]]) #[[ATTR0]] {
+// NVPTX-NEXT:  [[ENTRY:.*:]]
+// NVPTX-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4
+// NVPTX-NEXT:    [[__DIM_ADDR:%.*]] = alloca i32, align 4
+// NVPTX-NEXT:    store i32 [[__DIM]], ptr [[__DIM_ADDR]], align 4
+// NVPTX-NEXT:    [[TMP0:%.*]] = load i32, ptr [[__DIM_ADDR]], align 4
+// NVPTX-NEXT:    switch i32 [[TMP0]], label %[[SW_DEFAULT:.*]] [
+// NVPTX-NEXT:      i32 0, label %[[SW_BB:.*]]
+// NVPTX-NEXT:      i32 1, label %[[SW_BB1:.*]]
+// NVPTX-NEXT:      i32 2, label %[[SW_BB3:.*]]
+// NVPTX-NEXT:    ]
+// NVPTX:       [[SW_BB]]:
+// NVPTX-NEXT:    [[CALL:%.*]] = call i32 @__gpu_thread_id_x() #[[ATTR6]]
+// NVPTX-NEXT:    store i32 [[CALL]], ptr [[RETVAL]], align 4
+// NVPTX-NEXT:    br label %[[RETURN:.*]]
+// NVPTX:       [[SW_BB1]]:
+// NVPTX-NEXT:    [[CALL2:%.*]] = call i32 @__gpu_thread_id_y() #[[ATTR6]]
+// NVPTX-NEXT:    store i32 [[CALL2]], ptr [[RETVAL]], align 4
+// NVPTX-NEXT:    br label %[[RETURN]]
+// NVPTX:       [[SW_BB3]]:
+// NVPTX-NEXT:    [[CALL4:%.*]] = call i32 @__gpu_thread_id_z() #[[ATTR6]]
+// NVPTX-NEXT:    store i32 [[CALL4]], ptr [[RETVAL]], align 4
+// NVPTX-NEXT:    br label %[[RETURN]]
+// NVPTX:       [[SW_DEFAULT]]:
+// NVPTX-NEXT:    unreachable
+// NVPTX:       [[RETURN]]:
+// NVPTX-NEXT:    [[TMP1:%.*]] = load i32, ptr [[RETVAL]], align 4
+// NVPTX-NEXT:    ret i32 [[TMP1]]
+//
   __gpu_thread_id_y();
   __gpu_thread_id_z();
   __gpu_thread_id(0);
@@ -100,7 +162,7 @@ __gpu_kernel void foo() {
   __gpu_ballot(-1, 1);
   __gpu_sync_threads();
   __gpu_sync_lane(-1);
-  __gpu_shuffle_idx_u32(-1, -1, -1);
+  __gpu_shuffle_idx_u32(-1, -1, -1, 0);
   __gpu_first_lane_id(-1);
   __gpu_is_first_in_lane(-1);
   __gpu_exit();
diff --git a/libc/src/__support/GPU/utils.h b/libc/src/__support/GPU/utils.h
index e138c84c0cb22d..323c003f1ff074 100644
--- a/libc/src/__support/GPU/utils.h
+++ b/libc/src/__support/GPU/utils.h
@@ -87,8 +87,9 @@ LIBC_INLINE void sync_threads() { __gpu_sync_threads(); }
 
 LIBC_INLINE void sync_lane(uint64_t lane_mask) { __gpu_sync_lane(lane_mask); }
 
-LIBC_INLINE uint32_t shuffle(uint64_t lane_mask, uint32_t idx, uint32_t x) {
-  return __gpu_shuffle_idx_u32(lane_mask, idx, x);
+LIBC_INLINE uint32_t shuffle(uint64_t lane_mask, uint32_t idx, uint32_t x,
+                             uint32_t width = __gpu_num_lanes()) {
+  return __gpu_shuffle_idx_u32(lane_mask, idx, x, width);
 }
 
 [[noreturn]] LIBC_INLINE void end_program() { __gpu_exit(); }
diff --git a/libc/test/integration/src/__support/GPU/CMakeLists.txt b/libc/test/integration/src/__support/GPU/CMakeLists.txt
index 7811e0da45ddcf..68bbc3849bc7ec 100644
--- a/libc/test/integration/src/__support/GPU/CMakeLists.txt
+++ b/libc/test/integration/src/__support/GPU/CMakeLists.txt
@@ -9,3 +9,12 @@ add_integration_test(
   LOADER_ARGS
     --threads 64
 )
+
+add_integration_test(
+  shuffle_test
+  SUITE libc-support-gpu-tests
+  SRCS
+    shuffle.cpp
+  LOADER_ARGS
+    --threads 64
+)
diff --git a/libc/test/integration/src/__support/GPU/shuffle.cpp b/libc/test/integration/src/__support/GPU/shuffle.cpp
new file mode 100644
index 00000000000000..c346a2eb3f0c29
--- /dev/null
+++ b/libc/test/integration/src/__support/GPU/shuffle.cpp
@@ -0,0 +1,33 @@
+//===-- Test for the shuffle operations on the GPU ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/CPP/bit.h"
+#include "src/__support/GPU/utils.h"
+#include "test/IntegrationTest/test.h"
+
+using namespace LIBC_NAMESPACE;
+
+// Test to make sure the shuffle instruction works by doing a simple broadcast.
+// Each iteration reduces the width, so it will broadcast to a subset we check.
+static void test_shuffle() {
+  uint64_t mask = gpu::get_lane_mask();
+  EXPECT_EQ(cpp::popcount(mask), gpu::get_lane_size());
+
+  uint32_t x = gpu::get_lane_id();
+  for (uint32_t width = gpu::get_lane_size(); width > 0; width /= 2)
+    EXPECT_EQ(gpu::shuffle(mask, 0, x, width), (x / width) * width);
+}
+
+TEST_MAIN(int argc, char **argv, char **envp) {
+  if (gpu::get_thread_id() >= gpu::get_lane_size())
+    return 0;
+
+  test_shuffle();
+
+  return 0;
+}

``````````

</details>


https://github.com/llvm/llvm-project/pull/125912