[clang] [libc][nfc] Use common implementation of read_first_lane_u64 (PR #131027)
Jon Chesterfield via cfe-commits
cfe-commits at lists.llvm.org
Wed Mar 12 14:01:44 PDT 2025
https://github.com/JonChesterfield updated https://github.com/llvm/llvm-project/pull/131027
From 68f09d0f3f7849b91cb39ce42ba48e3e4aafb488 Mon Sep 17 00:00:00 2001
From: Jon Chesterfield <jonathanchesterfield at gmail.com>
Date: Wed, 12 Mar 2025 20:31:39 +0000
Subject: [PATCH] [libc][nfc] Use common implementation of read_first_lane_u64,
no codegen regression
---
clang/lib/Headers/amdgpuintrin.h | 15 ++----
clang/lib/Headers/gpuintrin.h | 10 ++++
clang/lib/Headers/nvptxintrin.h | 21 ++------
clang/test/Headers/gpuintrin.c | 87 +++++++++++++++++++++++++++++---
4 files changed, 99 insertions(+), 34 deletions(-)
diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
index 15409eacf7716..839a05175cf3e 100644
--- a/clang/lib/Headers/amdgpuintrin.h
+++ b/clang/lib/Headers/amdgpuintrin.h
@@ -33,6 +33,10 @@ _Pragma("omp begin declare variant match(device = {arch(amdgcn)})");
// Attribute to declare a function as a kernel.
#define __gpu_kernel __attribute__((amdgpu_kernel, visibility("protected")))
+// Defined in gpuintrin.h, used later in this file.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x);
+
// Returns the number of workgroups in the 'x' dimension of the grid.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
@@ -115,15 +119,6 @@ __gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) {
return __builtin_amdgcn_readfirstlane(__x);
}
-// Copies the value from the first active thread in the wavefront to the rest.
-_DEFAULT_FN_ATTRS __inline__ uint64_t
-__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
- uint32_t __hi = (uint32_t)(__x >> 32ull);
- uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
- return ((uint64_t)__builtin_amdgcn_readfirstlane(__hi) << 32ull) |
- ((uint64_t)__builtin_amdgcn_readfirstlane(__lo) & 0xFFFFFFFF);
-}
-
// Returns a bitmask of threads in the current lane for which \p x is true.
_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask,
bool __x) {
@@ -203,7 +198,7 @@ __gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
// Returns the current lane mask if every lane contains __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
- uint32_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
+ uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
__gpu_sync_lane(__lane_mask);
return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
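A note on the forward declaration added at the top of this file: amdgpuintrin.h is textually included by gpuintrin.h before the common helpers are defined, yet code later in this file (presumably __gpu_match_any_u64, per the hunk context above) calls __gpu_read_first_lane_u64. A minimal sketch of the declare-before-define pattern follows; the names read_first_lane_u64 and caller are illustrative, and the placeholder body stands in for the real definition in gpuintrin.h:

  #include <stdint.h>

  /* Declaration first: callers below may reference the function even
     though its body appears later (as in gpuintrin.h). */
  static inline uint64_t read_first_lane_u64(uint64_t mask, uint64_t x);

  /* Hypothetical caller, analogous to code later in amdgpuintrin.h. */
  static inline uint64_t caller(uint64_t mask, uint64_t x) {
    return read_first_lane_u64(mask, x);
  }

  /* Definition last, analogous to the common one in gpuintrin.h.
     Placeholder body for the sketch only. */
  static inline uint64_t read_first_lane_u64(uint64_t mask, uint64_t x) {
    (void)mask;
    return x;
  }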
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index efdc3d94ac0b3..4181628d18048 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -115,6 +115,16 @@ __gpu_is_first_in_lane(uint64_t __lane_mask) {
return __gpu_lane_id() == __gpu_first_lane_id(__lane_mask);
}
+// Copies the value from the first active thread in the wavefront to the rest.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
+ uint32_t __hi = (uint32_t)(__x >> 32ull);
+ uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFFull);
+ return ((uint64_t)__gpu_read_first_lane_u32(__lane_mask, __hi) << 32ull) |
+ ((uint64_t)__gpu_read_first_lane_u32(__lane_mask, __lo) &
+ 0xFFFFFFFFull);
+}
+
// Gets the first floating point value from the active lanes.
_DEFAULT_FN_ATTRS static __inline__ float
__gpu_read_first_lane_f32(uint64_t __lane_mask, float __x) {
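The common implementation above splits the 64-bit value into two 32-bit halves, broadcasts each half through __gpu_read_first_lane_u32, and recombines them. A host-side sketch of the same bit manipulation, with the broadcast replaced by an identity stand-in (readfirstlane has no host equivalent; first_lane_u32 and first_lane_u64 are illustrative names), shows the split/recombine round trip is lossless:

  #include <assert.h>
  #include <stdint.h>

  /* Stand-in for __gpu_read_first_lane_u32: with a single "lane" the
     broadcast is the identity. */
  static uint32_t first_lane_u32(uint64_t mask, uint32_t x) {
    (void)mask;
    return x;
  }

  /* Same hi/lo split and recombination as the header. */
  static uint64_t first_lane_u64(uint64_t mask, uint64_t x) {
    uint32_t hi = (uint32_t)(x >> 32);
    uint32_t lo = (uint32_t)(x & 0xFFFFFFFFull);
    return ((uint64_t)first_lane_u32(mask, hi) << 32) |
           ((uint64_t)first_lane_u32(mask, lo) & 0xFFFFFFFFull);
  }

  int main(void) {
    assert(first_lane_u64(~0ull, 0x123456789ABCDEF0ull) ==
           0x123456789ABCDEF0ull);
    assert(first_lane_u64(~0ull, ~0ull) == ~0ull);
    return 0;
  }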
diff --git a/clang/lib/Headers/nvptxintrin.h b/clang/lib/Headers/nvptxintrin.h
index 73eb0af8b5926..d00a5f6de3950 100644
--- a/clang/lib/Headers/nvptxintrin.h
+++ b/clang/lib/Headers/nvptxintrin.h
@@ -37,6 +37,10 @@ _Pragma("omp begin declare variant match(device = {arch(nvptx64)})");
// Attribute to declare a function as a kernel.
#define __gpu_kernel __attribute__((nvptx_kernel, visibility("protected")))
+// Defined in gpuintrin.h, used later in this file.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x);
+
// Returns the number of CUDA blocks in the 'x' dimension.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
return __nvvm_read_ptx_sreg_nctaid_x();
@@ -120,21 +124,6 @@ __gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) {
return __nvvm_shfl_sync_idx_i32(__mask, __x, __id, __gpu_num_lanes() - 1);
}
-// Copies the value from the first active thread in the warp to the rest.
-_DEFAULT_FN_ATTRS static __inline__ uint64_t
-__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
- uint32_t __hi = (uint32_t)(__x >> 32ull);
- uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
- uint32_t __mask = (uint32_t)__lane_mask;
- uint32_t __id = __builtin_ffs(__mask) - 1;
- return ((uint64_t)__nvvm_shfl_sync_idx_i32(__mask, __hi, __id,
- __gpu_num_lanes() - 1)
- << 32ull) |
- ((uint64_t)__nvvm_shfl_sync_idx_i32(__mask, __lo, __id,
- __gpu_num_lanes() - 1) &
- 0xFFFFFFFF);
-}
-
// Returns a bitmask of threads in the current lane for which \p x is true.
_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask,
bool __x) {
@@ -231,7 +220,7 @@ __gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
return __nvvm_match_all_sync_i32p(__lane_mask, __x, &predicate);
#endif
- uint32_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
+ uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
}
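For reference, the deleted NVPTX body duplicated the first-active-lane computation that __gpu_read_first_lane_u32 already performs: __builtin_ffs(__mask) - 1 yields the index of the lowest set bit in the mask. A small host-side check of that index computation (illustrative only; first_lane_id is a made-up name and no shuffle is involved):

  #include <assert.h>

  /* Index of the lowest set bit, i.e. the first active lane.
     Mirrors the `__builtin_ffs(__mask) - 1` in the header. */
  static int first_lane_id(unsigned mask) {
    return __builtin_ffs((int)mask) - 1;
  }

  int main(void) {
    assert(first_lane_id(0x1u) == 0);        /* lane 0 active */
    assert(first_lane_id(0xFFFFFFF0u) == 4); /* lanes 4..31 active */
    return 0;
  }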
diff --git a/clang/test/Headers/gpuintrin.c b/clang/test/Headers/gpuintrin.c
index 30aa6f147ba03..9a15ce277ba87 100644
--- a/clang/test/Headers/gpuintrin.c
+++ b/clang/test/Headers/gpuintrin.c
@@ -33,6 +33,7 @@ __gpu_kernel void foo() {
__gpu_lane_id();
__gpu_lane_mask();
__gpu_read_first_lane_u32(-1, -1);
+ __gpu_read_first_lane_u64(-1, -1);
__gpu_ballot(-1, 1);
__gpu_sync_threads();
__gpu_sync_lane(-1);
@@ -64,12 +65,13 @@ __gpu_kernel void foo() {
// AMDGPU-NEXT: [[CALL17:%.*]] = call i32 @__gpu_lane_id() #[[ATTR7]]
// AMDGPU-NEXT: [[CALL18:%.*]] = call i64 @__gpu_lane_mask() #[[ATTR7]]
// AMDGPU-NEXT: [[CALL19:%.*]] = call i32 @__gpu_read_first_lane_u32(i64 noundef -1, i32 noundef -1) #[[ATTR7]]
-// AMDGPU-NEXT: [[CALL20:%.*]] = call i64 @__gpu_ballot(i64 noundef -1, i1 noundef zeroext true) #[[ATTR7]]
+// AMDGPU-NEXT: [[CALL20:%.*]] = call i64 @__gpu_read_first_lane_u64(i64 noundef -1, i64 noundef -1) #[[ATTR7]]
+// AMDGPU-NEXT: [[CALL21:%.*]] = call i64 @__gpu_ballot(i64 noundef -1, i1 noundef zeroext true) #[[ATTR7]]
// AMDGPU-NEXT: call void @__gpu_sync_threads() #[[ATTR7]]
// AMDGPU-NEXT: call void @__gpu_sync_lane(i64 noundef -1) #[[ATTR7]]
-// AMDGPU-NEXT: [[CALL21:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1, i32 noundef 0) #[[ATTR7]]
-// AMDGPU-NEXT: [[CALL22:%.*]] = call i64 @__gpu_first_lane_id(i64 noundef -1) #[[ATTR7]]
-// AMDGPU-NEXT: [[CALL23:%.*]] = call zeroext i1 @__gpu_is_first_in_lane(i64 noundef -1) #[[ATTR7]]
+// AMDGPU-NEXT: [[CALL22:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1, i32 noundef 0) #[[ATTR7]]
+// AMDGPU-NEXT: [[CALL23:%.*]] = call i64 @__gpu_first_lane_id(i64 noundef -1) #[[ATTR7]]
+// AMDGPU-NEXT: [[CALL24:%.*]] = call zeroext i1 @__gpu_is_first_in_lane(i64 noundef -1) #[[ATTR7]]
// AMDGPU-NEXT: call void @__gpu_exit() #[[ATTR8:[0-9]+]]
// AMDGPU-NEXT: unreachable
//
@@ -388,6 +390,43 @@ __gpu_kernel void foo() {
// AMDGPU-NEXT: ret i32 [[TMP1]]
//
//
+// AMDGPU-LABEL: define internal i64 @__gpu_read_first_lane_u64(
+// AMDGPU-SAME: i64 noundef [[__LANE_MASK:%.*]], i64 noundef [[__X:%.*]]) #[[ATTR0]] {
+// AMDGPU-NEXT: [[ENTRY:.*:]]
+// AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i64, align 8, addrspace(5)
+// AMDGPU-NEXT: [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
+// AMDGPU-NEXT: [[__X_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
+// AMDGPU-NEXT: [[__HI:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT: [[__LO:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGPU-NEXT: [[__LANE_MASK_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__LANE_MASK_ADDR]] to ptr
+// AMDGPU-NEXT: [[__X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__X_ADDR]] to ptr
+// AMDGPU-NEXT: [[__HI_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__HI]] to ptr
+// AMDGPU-NEXT: [[__LO_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__LO]] to ptr
+// AMDGPU-NEXT: store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR_ASCAST]], align 8
+// AMDGPU-NEXT: store i64 [[__X]], ptr [[__X_ADDR_ASCAST]], align 8
+// AMDGPU-NEXT: [[TMP0:%.*]] = load i64, ptr [[__X_ADDR_ASCAST]], align 8
+// AMDGPU-NEXT: [[SHR:%.*]] = lshr i64 [[TMP0]], 32
+// AMDGPU-NEXT: [[CONV:%.*]] = trunc i64 [[SHR]] to i32
+// AMDGPU-NEXT: store i32 [[CONV]], ptr [[__HI_ASCAST]], align 4
+// AMDGPU-NEXT: [[TMP1:%.*]] = load i64, ptr [[__X_ADDR_ASCAST]], align 8
+// AMDGPU-NEXT: [[AND:%.*]] = and i64 [[TMP1]], 4294967295
+// AMDGPU-NEXT: [[CONV1:%.*]] = trunc i64 [[AND]] to i32
+// AMDGPU-NEXT: store i32 [[CONV1]], ptr [[__LO_ASCAST]], align 4
+// AMDGPU-NEXT: [[TMP2:%.*]] = load i64, ptr [[__LANE_MASK_ADDR_ASCAST]], align 8
+// AMDGPU-NEXT: [[TMP3:%.*]] = load i32, ptr [[__HI_ASCAST]], align 4
+// AMDGPU-NEXT: [[CALL:%.*]] = call i32 @__gpu_read_first_lane_u32(i64 noundef [[TMP2]], i32 noundef [[TMP3]]) #[[ATTR7]]
+// AMDGPU-NEXT: [[CONV2:%.*]] = zext i32 [[CALL]] to i64
+// AMDGPU-NEXT: [[SHL:%.*]] = shl i64 [[CONV2]], 32
+// AMDGPU-NEXT: [[TMP4:%.*]] = load i64, ptr [[__LANE_MASK_ADDR_ASCAST]], align 8
+// AMDGPU-NEXT: [[TMP5:%.*]] = load i32, ptr [[__LO_ASCAST]], align 4
+// AMDGPU-NEXT: [[CALL3:%.*]] = call i32 @__gpu_read_first_lane_u32(i64 noundef [[TMP4]], i32 noundef [[TMP5]]) #[[ATTR7]]
+// AMDGPU-NEXT: [[CONV4:%.*]] = zext i32 [[CALL3]] to i64
+// AMDGPU-NEXT: [[AND5:%.*]] = and i64 [[CONV4]], 4294967295
+// AMDGPU-NEXT: [[OR:%.*]] = or i64 [[SHL]], [[AND5]]
+// AMDGPU-NEXT: ret i64 [[OR]]
+//
+//
// AMDGPU-LABEL: define internal i64 @__gpu_ballot(
// AMDGPU-SAME: i64 noundef [[__LANE_MASK:%.*]], i1 noundef zeroext [[__X:%.*]]) #[[ATTR0]] {
// AMDGPU-NEXT: [[ENTRY:.*:]]
@@ -525,12 +564,13 @@ __gpu_kernel void foo() {
// NVPTX-NEXT: [[CALL17:%.*]] = call i32 @__gpu_lane_id() #[[ATTR6]]
// NVPTX-NEXT: [[CALL18:%.*]] = call i64 @__gpu_lane_mask() #[[ATTR6]]
// NVPTX-NEXT: [[CALL19:%.*]] = call i32 @__gpu_read_first_lane_u32(i64 noundef -1, i32 noundef -1) #[[ATTR6]]
-// NVPTX-NEXT: [[CALL20:%.*]] = call i64 @__gpu_ballot(i64 noundef -1, i1 noundef zeroext true) #[[ATTR6]]
+// NVPTX-NEXT: [[CALL20:%.*]] = call i64 @__gpu_read_first_lane_u64(i64 noundef -1, i64 noundef -1) #[[ATTR6]]
+// NVPTX-NEXT: [[CALL21:%.*]] = call i64 @__gpu_ballot(i64 noundef -1, i1 noundef zeroext true) #[[ATTR6]]
// NVPTX-NEXT: call void @__gpu_sync_threads() #[[ATTR6]]
// NVPTX-NEXT: call void @__gpu_sync_lane(i64 noundef -1) #[[ATTR6]]
-// NVPTX-NEXT: [[CALL21:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1, i32 noundef 0) #[[ATTR6]]
-// NVPTX-NEXT: [[CALL22:%.*]] = call i64 @__gpu_first_lane_id(i64 noundef -1) #[[ATTR6]]
-// NVPTX-NEXT: [[CALL23:%.*]] = call zeroext i1 @__gpu_is_first_in_lane(i64 noundef -1) #[[ATTR6]]
+// NVPTX-NEXT: [[CALL22:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1, i32 noundef 0) #[[ATTR6]]
+// NVPTX-NEXT: [[CALL23:%.*]] = call i64 @__gpu_first_lane_id(i64 noundef -1) #[[ATTR6]]
+// NVPTX-NEXT: [[CALL24:%.*]] = call zeroext i1 @__gpu_is_first_in_lane(i64 noundef -1) #[[ATTR6]]
// NVPTX-NEXT: call void @__gpu_exit() #[[ATTR7:[0-9]+]]
// NVPTX-NEXT: unreachable
//
@@ -793,6 +833,37 @@ __gpu_kernel void foo() {
// NVPTX-NEXT: ret i32 [[TMP7]]
//
//
+// NVPTX-LABEL: define internal i64 @__gpu_read_first_lane_u64(
+// NVPTX-SAME: i64 noundef [[__LANE_MASK:%.*]], i64 noundef [[__X:%.*]]) #[[ATTR0]] {
+// NVPTX-NEXT: [[ENTRY:.*:]]
+// NVPTX-NEXT: [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8
+// NVPTX-NEXT: [[__X_ADDR:%.*]] = alloca i64, align 8
+// NVPTX-NEXT: [[__HI:%.*]] = alloca i32, align 4
+// NVPTX-NEXT: [[__LO:%.*]] = alloca i32, align 4
+// NVPTX-NEXT: store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR]], align 8
+// NVPTX-NEXT: store i64 [[__X]], ptr [[__X_ADDR]], align 8
+// NVPTX-NEXT: [[TMP0:%.*]] = load i64, ptr [[__X_ADDR]], align 8
+// NVPTX-NEXT: [[SHR:%.*]] = lshr i64 [[TMP0]], 32
+// NVPTX-NEXT: [[CONV:%.*]] = trunc i64 [[SHR]] to i32
+// NVPTX-NEXT: store i32 [[CONV]], ptr [[__HI]], align 4
+// NVPTX-NEXT: [[TMP1:%.*]] = load i64, ptr [[__X_ADDR]], align 8
+// NVPTX-NEXT: [[AND:%.*]] = and i64 [[TMP1]], 4294967295
+// NVPTX-NEXT: [[CONV1:%.*]] = trunc i64 [[AND]] to i32
+// NVPTX-NEXT: store i32 [[CONV1]], ptr [[__LO]], align 4
+// NVPTX-NEXT: [[TMP2:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8
+// NVPTX-NEXT: [[TMP3:%.*]] = load i32, ptr [[__HI]], align 4
+// NVPTX-NEXT: [[CALL:%.*]] = call i32 @__gpu_read_first_lane_u32(i64 noundef [[TMP2]], i32 noundef [[TMP3]]) #[[ATTR6]]
+// NVPTX-NEXT: [[CONV2:%.*]] = zext i32 [[CALL]] to i64
+// NVPTX-NEXT: [[SHL:%.*]] = shl i64 [[CONV2]], 32
+// NVPTX-NEXT: [[TMP4:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8
+// NVPTX-NEXT: [[TMP5:%.*]] = load i32, ptr [[__LO]], align 4
+// NVPTX-NEXT: [[CALL3:%.*]] = call i32 @__gpu_read_first_lane_u32(i64 noundef [[TMP4]], i32 noundef [[TMP5]]) #[[ATTR6]]
+// NVPTX-NEXT: [[CONV4:%.*]] = zext i32 [[CALL3]] to i64
+// NVPTX-NEXT: [[AND5:%.*]] = and i64 [[CONV4]], 4294967295
+// NVPTX-NEXT: [[OR:%.*]] = or i64 [[SHL]], [[AND5]]
+// NVPTX-NEXT: ret i64 [[OR]]
+//
+//
// NVPTX-LABEL: define internal i64 @__gpu_ballot(
// NVPTX-SAME: i64 noundef [[__LANE_MASK:%.*]], i1 noundef zeroext [[__X:%.*]]) #[[ATTR0]] {
// NVPTX-NEXT: [[ENTRY:.*:]]
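For context, a minimal kernel exercising the now-common helper might look like the following. This is a hypothetical usage sketch, not part of the patch; broadcast_example is a made-up name, and the sketch assumes only the gpuintrin.h entry points already exercised by the test above:

  #include <gpuintrin.h>
  #include <stdint.h>

  /* Broadcast a per-lane 64-bit value from the first active lane to
     the whole wavefront/warp, then let only that lane write it out. */
  __gpu_kernel void broadcast_example(uint64_t *out) {
    uint64_t mask = __gpu_lane_mask();
    uint64_t v = (uint64_t)__gpu_lane_id() |
                 ((uint64_t)__gpu_lane_id() << 32);
    uint64_t first = __gpu_read_first_lane_u64(mask, v);
    if (__gpu_is_first_in_lane(mask))
      *out = first; /* every lane computed the same `first` */
  }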