[llvm-branch-commits] [clang] [libc] release/20.x: [Clang] Fix test after new argument was added (PR #125912)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Wed Feb 5 11:10:48 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-x86
Author: llvmbot
<details>
<summary>Changes</summary>
Backport 2d8106cb5a505326d1da0f4461708ed44a0ac761 718cdeb9c701725412a040b2b7148523a286a256
Requested by: @jhuber6
---
Full diff: https://github.com/llvm/llvm-project/pull/125912.diff
7 Files Affected:
- (modified) clang/lib/Headers/amdgpuintrin.h (+9-5)
- (modified) clang/lib/Headers/gpuintrin.h (+14-10)
- (modified) clang/lib/Headers/nvptxintrin.h (+8-7)
- (modified) clang/test/Headers/gpuintrin.c (+65-3)
- (modified) libc/src/__support/GPU/utils.h (+3-2)
- (modified) libc/test/integration/src/__support/GPU/CMakeLists.txt (+9)
- (added) libc/test/integration/src/__support/GPU/shuffle.cpp (+33)
``````````diff
diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
index 038605605462f8..9dad99ffe9439a 100644
--- a/clang/lib/Headers/amdgpuintrin.h
+++ b/clang/lib/Headers/amdgpuintrin.h
@@ -145,17 +145,21 @@ _DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) {
// Shuffles the the lanes inside the wavefront according to the given index.
_DEFAULT_FN_ATTRS static __inline__ uint32_t
-__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) {
- return __builtin_amdgcn_ds_bpermute(__idx << 2, __x);
+__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
+ uint32_t __width) {
+ uint32_t __lane = __idx + (__gpu_lane_id() & ~(__width - 1));
+ return __builtin_amdgcn_ds_bpermute(__lane << 2, __x);
}
// Shuffles the the lanes inside the wavefront according to the given index.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
-__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x) {
+__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x,
+ uint32_t __width) {
uint32_t __hi = (uint32_t)(__x >> 32ull);
uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
- return ((uint64_t)__builtin_amdgcn_ds_bpermute(__idx << 2, __hi) << 32ull) |
- ((uint64_t)__builtin_amdgcn_ds_bpermute(__idx << 2, __lo));
+ return ((uint64_t)__gpu_shuffle_idx_u32(__lane_mask, __idx, __hi, __width)
+ << 32ull) |
+ ((uint64_t)__gpu_shuffle_idx_u32(__lane_mask, __idx, __lo, __width));
}
// Returns true if the flat pointer points to AMDGPU 'shared' memory.
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index 4c463c333308fc..11c87e85cd4975 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -133,18 +133,21 @@ __gpu_read_first_lane_f64(uint64_t __lane_mask, double __x) {
// Shuffles the the lanes according to the given index.
_DEFAULT_FN_ATTRS static __inline__ float
-__gpu_shuffle_idx_f32(uint64_t __lane_mask, uint32_t __idx, float __x) {
+__gpu_shuffle_idx_f32(uint64_t __lane_mask, uint32_t __idx, float __x,
+ uint32_t __width) {
return __builtin_bit_cast(
float, __gpu_shuffle_idx_u32(__lane_mask, __idx,
- __builtin_bit_cast(uint32_t, __x)));
+ __builtin_bit_cast(uint32_t, __x), __width));
}
// Shuffles the the lanes according to the given index.
_DEFAULT_FN_ATTRS static __inline__ double
-__gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x) {
+__gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x,
+ uint32_t __width) {
return __builtin_bit_cast(
- double, __gpu_shuffle_idx_u64(__lane_mask, __idx,
- __builtin_bit_cast(uint64_t, __x)));
+ double,
+ __gpu_shuffle_idx_u64(__lane_mask, __idx,
+ __builtin_bit_cast(uint64_t, __x), __width));
}
// Gets the sum of all lanes inside the warp or wavefront.
@@ -153,7 +156,8 @@ __gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x) {
uint64_t __lane_mask, __type __x) { \
for (uint32_t __step = __gpu_num_lanes() / 2; __step > 0; __step /= 2) { \
uint32_t __index = __step + __gpu_lane_id(); \
- __x += __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x); \
+ __x += __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x, \
+ __gpu_num_lanes()); \
} \
return __gpu_read_first_lane_##__suffix(__lane_mask, __x); \
}
@@ -171,10 +175,10 @@ __DO_LANE_SUM(double, f64); // double __gpu_lane_sum_f64(m, x)
uint32_t __index = __gpu_lane_id() - __step; \
__bitmask_type bitmask = __gpu_lane_id() >= __step; \
__x += __builtin_bit_cast( \
- __type, \
- -bitmask & __builtin_bit_cast(__bitmask_type, \
- __gpu_shuffle_idx_##__suffix( \
- __lane_mask, __index, __x))); \
+ __type, -bitmask & __builtin_bit_cast(__bitmask_type, \
+ __gpu_shuffle_idx_##__suffix( \
+ __lane_mask, __index, __x, \
+ __gpu_num_lanes()))); \
} \
return __x; \
}
diff --git a/clang/lib/Headers/nvptxintrin.h b/clang/lib/Headers/nvptxintrin.h
index fb2864eab6a09d..40fa2edebe975c 100644
--- a/clang/lib/Headers/nvptxintrin.h
+++ b/clang/lib/Headers/nvptxintrin.h
@@ -149,22 +149,23 @@ _DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) {
// Shuffles the the lanes inside the warp according to the given index.
_DEFAULT_FN_ATTRS static __inline__ uint32_t
-__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) {
+__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
+ uint32_t __width) {
uint32_t __mask = (uint32_t)__lane_mask;
- return __nvvm_shfl_sync_idx_i32(__mask, __x, __idx, __gpu_num_lanes() - 1u);
+ return __nvvm_shfl_sync_idx_i32(__mask, __x, __idx,
+ ((__gpu_num_lanes() - __width) << 8u) | 0x1f);
}
// Shuffles the the lanes inside the warp according to the given index.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
-__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x) {
+__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x,
+ uint32_t __width) {
uint32_t __hi = (uint32_t)(__x >> 32ull);
uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
uint32_t __mask = (uint32_t)__lane_mask;
- return ((uint64_t)__nvvm_shfl_sync_idx_i32(__mask, __hi, __idx,
- __gpu_num_lanes() - 1u)
+ return ((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __hi, __width)
<< 32ull) |
- ((uint64_t)__nvvm_shfl_sync_idx_i32(__mask, __lo, __idx,
- __gpu_num_lanes() - 1u));
+ ((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __lo, __width));
}
// Returns true if the flat pointer points to CUDA 'shared' memory.
diff --git a/clang/test/Headers/gpuintrin.c b/clang/test/Headers/gpuintrin.c
index 281339716c3edf..89efe12ee8def8 100644
--- a/clang/test/Headers/gpuintrin.c
+++ b/clang/test/Headers/gpuintrin.c
@@ -38,7 +38,7 @@
// AMDGPU-NEXT: [[CALL20:%.*]] = call i64 @__gpu_ballot(i64 noundef -1, i1 noundef zeroext true) #[[ATTR7]]
// AMDGPU-NEXT: call void @__gpu_sync_threads() #[[ATTR7]]
// AMDGPU-NEXT: call void @__gpu_sync_lane(i64 noundef -1) #[[ATTR7]]
-// AMDGPU-NEXT: [[CALL21:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1) #[[ATTR7]]
+// AMDGPU-NEXT: [[CALL21:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1, i32 noundef 0) #[[ATTR7]]
// AMDGPU-NEXT: [[CALL22:%.*]] = call i64 @__gpu_first_lane_id(i64 noundef -1) #[[ATTR7]]
// AMDGPU-NEXT: [[CALL23:%.*]] = call zeroext i1 @__gpu_is_first_in_lane(i64 noundef -1) #[[ATTR7]]
// AMDGPU-NEXT: call void @__gpu_exit() #[[ATTR8:[0-9]+]]
@@ -70,7 +70,7 @@
// NVPTX-NEXT: [[CALL20:%.*]] = call i64 @__gpu_ballot(i64 noundef -1, i1 noundef zeroext true) #[[ATTR6]]
// NVPTX-NEXT: call void @__gpu_sync_threads() #[[ATTR6]]
// NVPTX-NEXT: call void @__gpu_sync_lane(i64 noundef -1) #[[ATTR6]]
-// NVPTX-NEXT: [[CALL21:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1) #[[ATTR6]]
+// NVPTX-NEXT: [[CALL21:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1, i32 noundef 0) #[[ATTR6]]
// NVPTX-NEXT: [[CALL22:%.*]] = call i64 @__gpu_first_lane_id(i64 noundef -1) #[[ATTR6]]
// NVPTX-NEXT: [[CALL23:%.*]] = call zeroext i1 @__gpu_is_first_in_lane(i64 noundef -1) #[[ATTR6]]
// NVPTX-NEXT: call void @__gpu_exit() #[[ATTR7:[0-9]+]]
@@ -90,6 +90,68 @@ __gpu_kernel void foo() {
__gpu_num_threads_z();
__gpu_num_threads(0);
__gpu_thread_id_x();
+// AMDGPU-LABEL: define internal i32 @__gpu_thread_id(
+// AMDGPU-SAME: i32 noundef [[__DIM:%.*]]) #[[ATTR0]] {
+// AMDGPU-NEXT: [[ENTRY:.*:]]
+// AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT: [[__DIM_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGPU-NEXT: [[__DIM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__DIM_ADDR]] to ptr
+// AMDGPU-NEXT: store i32 [[__DIM]], ptr [[__DIM_ADDR_ASCAST]], align 4
+// AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[__DIM_ADDR_ASCAST]], align 4
+// AMDGPU-NEXT: switch i32 [[TMP0]], label %[[SW_DEFAULT:.*]] [
+// AMDGPU-NEXT: i32 0, label %[[SW_BB:.*]]
+// AMDGPU-NEXT: i32 1, label %[[SW_BB1:.*]]
+// AMDGPU-NEXT: i32 2, label %[[SW_BB3:.*]]
+// AMDGPU-NEXT: ]
+// AMDGPU: [[SW_BB]]:
+// AMDGPU-NEXT: [[CALL:%.*]] = call i32 @__gpu_thread_id_x() #[[ATTR7]]
+// AMDGPU-NEXT: store i32 [[CALL]], ptr [[RETVAL_ASCAST]], align 4
+// AMDGPU-NEXT: br label %[[RETURN:.*]]
+// AMDGPU: [[SW_BB1]]:
+// AMDGPU-NEXT: [[CALL2:%.*]] = call i32 @__gpu_thread_id_y() #[[ATTR7]]
+// AMDGPU-NEXT: store i32 [[CALL2]], ptr [[RETVAL_ASCAST]], align 4
+// AMDGPU-NEXT: br label %[[RETURN]]
+// AMDGPU: [[SW_BB3]]:
+// AMDGPU-NEXT: [[CALL4:%.*]] = call i32 @__gpu_thread_id_z() #[[ATTR7]]
+// AMDGPU-NEXT: store i32 [[CALL4]], ptr [[RETVAL_ASCAST]], align 4
+// AMDGPU-NEXT: br label %[[RETURN]]
+// AMDGPU: [[SW_DEFAULT]]:
+// AMDGPU-NEXT: unreachable
+// AMDGPU: [[RETURN]]:
+// AMDGPU-NEXT: [[TMP1:%.*]] = load i32, ptr [[RETVAL_ASCAST]], align 4
+// AMDGPU-NEXT: ret i32 [[TMP1]]
+//
+// NVPTX-LABEL: define internal i32 @__gpu_thread_id(
+// NVPTX-SAME: i32 noundef [[__DIM:%.*]]) #[[ATTR0]] {
+// NVPTX-NEXT: [[ENTRY:.*:]]
+// NVPTX-NEXT: [[RETVAL:%.*]] = alloca i32, align 4
+// NVPTX-NEXT: [[__DIM_ADDR:%.*]] = alloca i32, align 4
+// NVPTX-NEXT: store i32 [[__DIM]], ptr [[__DIM_ADDR]], align 4
+// NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[__DIM_ADDR]], align 4
+// NVPTX-NEXT: switch i32 [[TMP0]], label %[[SW_DEFAULT:.*]] [
+// NVPTX-NEXT: i32 0, label %[[SW_BB:.*]]
+// NVPTX-NEXT: i32 1, label %[[SW_BB1:.*]]
+// NVPTX-NEXT: i32 2, label %[[SW_BB3:.*]]
+// NVPTX-NEXT: ]
+// NVPTX: [[SW_BB]]:
+// NVPTX-NEXT: [[CALL:%.*]] = call i32 @__gpu_thread_id_x() #[[ATTR6]]
+// NVPTX-NEXT: store i32 [[CALL]], ptr [[RETVAL]], align 4
+// NVPTX-NEXT: br label %[[RETURN:.*]]
+// NVPTX: [[SW_BB1]]:
+// NVPTX-NEXT: [[CALL2:%.*]] = call i32 @__gpu_thread_id_y() #[[ATTR6]]
+// NVPTX-NEXT: store i32 [[CALL2]], ptr [[RETVAL]], align 4
+// NVPTX-NEXT: br label %[[RETURN]]
+// NVPTX: [[SW_BB3]]:
+// NVPTX-NEXT: [[CALL4:%.*]] = call i32 @__gpu_thread_id_z() #[[ATTR6]]
+// NVPTX-NEXT: store i32 [[CALL4]], ptr [[RETVAL]], align 4
+// NVPTX-NEXT: br label %[[RETURN]]
+// NVPTX: [[SW_DEFAULT]]:
+// NVPTX-NEXT: unreachable
+// NVPTX: [[RETURN]]:
+// NVPTX-NEXT: [[TMP1:%.*]] = load i32, ptr [[RETVAL]], align 4
+// NVPTX-NEXT: ret i32 [[TMP1]]
+//
__gpu_thread_id_y();
__gpu_thread_id_z();
__gpu_thread_id(0);
@@ -100,7 +162,7 @@ __gpu_kernel void foo() {
__gpu_ballot(-1, 1);
__gpu_sync_threads();
__gpu_sync_lane(-1);
- __gpu_shuffle_idx_u32(-1, -1, -1);
+ __gpu_shuffle_idx_u32(-1, -1, -1, 0);
__gpu_first_lane_id(-1);
__gpu_is_first_in_lane(-1);
__gpu_exit();
diff --git a/libc/src/__support/GPU/utils.h b/libc/src/__support/GPU/utils.h
index e138c84c0cb22d..323c003f1ff074 100644
--- a/libc/src/__support/GPU/utils.h
+++ b/libc/src/__support/GPU/utils.h
@@ -87,8 +87,9 @@ LIBC_INLINE void sync_threads() { __gpu_sync_threads(); }
LIBC_INLINE void sync_lane(uint64_t lane_mask) { __gpu_sync_lane(lane_mask); }
-LIBC_INLINE uint32_t shuffle(uint64_t lane_mask, uint32_t idx, uint32_t x) {
- return __gpu_shuffle_idx_u32(lane_mask, idx, x);
+LIBC_INLINE uint32_t shuffle(uint64_t lane_mask, uint32_t idx, uint32_t x,
+ uint32_t width = __gpu_num_lanes()) {
+ return __gpu_shuffle_idx_u32(lane_mask, idx, x, width);
}
[[noreturn]] LIBC_INLINE void end_program() { __gpu_exit(); }
diff --git a/libc/test/integration/src/__support/GPU/CMakeLists.txt b/libc/test/integration/src/__support/GPU/CMakeLists.txt
index 7811e0da45ddcf..68bbc3849bc7ec 100644
--- a/libc/test/integration/src/__support/GPU/CMakeLists.txt
+++ b/libc/test/integration/src/__support/GPU/CMakeLists.txt
@@ -9,3 +9,12 @@ add_integration_test(
LOADER_ARGS
--threads 64
)
+
+add_integration_test(
+ shuffle_test
+ SUITE libc-support-gpu-tests
+ SRCS
+ shuffle.cpp
+ LOADER_ARGS
+ --threads 64
+)
diff --git a/libc/test/integration/src/__support/GPU/shuffle.cpp b/libc/test/integration/src/__support/GPU/shuffle.cpp
new file mode 100644
index 00000000000000..c346a2eb3f0c29
--- /dev/null
+++ b/libc/test/integration/src/__support/GPU/shuffle.cpp
@@ -0,0 +1,33 @@
+//===-- Test for the shuffle operations on the GPU ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/CPP/bit.h"
+#include "src/__support/GPU/utils.h"
+#include "test/IntegrationTest/test.h"
+
+using namespace LIBC_NAMESPACE;
+
+// Test to make sure the shuffle instruction works by doing a simple broadcast.
+// Each iteration reduces the width, so it will broadcast to a subset we check.
+static void test_shuffle() {
+ uint64_t mask = gpu::get_lane_mask();
+ EXPECT_EQ(cpp::popcount(mask), gpu::get_lane_size());
+
+ uint32_t x = gpu::get_lane_id();
+ for (uint32_t width = gpu::get_lane_size(); width > 0; width /= 2)
+ EXPECT_EQ(gpu::shuffle(mask, 0, x, width), (x / width) * width);
+}
+
+TEST_MAIN(int argc, char **argv, char **envp) {
+ if (gpu::get_thread_id() >= gpu::get_lane_size())
+ return 0;
+
+ test_shuffle();
+
+ return 0;
+}
``````````
</details>
https://github.com/llvm/llvm-project/pull/125912
More information about the llvm-branch-commits mailing list