[clang] a89bb62 - [Clang] Update the 'gpuintrin.h' lane scan handling (#185451)
via cfe-commits
cfe-commits at lists.llvm.org
Mon Mar 9 13:50:57 PDT 2026
Author: Joseph Huber
Date: 2026-03-09T15:50:51-05:00
New Revision: a89bb6291dd9cbc401f291b4b4c15a71f9053eb0
URL: https://github.com/llvm/llvm-project/commit/a89bb6291dd9cbc401f291b4b4c15a71f9053eb0
DIFF: https://github.com/llvm/llvm-project/commit/a89bb6291dd9cbc401f291b4b4c15a71f9053eb0.diff
LOG: [Clang] Update the 'gpuintrin.h' lane scan handling (#185451)
Summary:
This patch replaces the divergent-branch reduction with a more efficient algorithm. It also provides prefix and suffix scan variants, and the lane sum is now simply the first lane's value of the suffix scan.
The scan entry points are renamed as shown below, which is technically a breaking change, but these functions saw little use in practice and callers can switch trivially by checking the clang version if needed.
```
__gpu_prefix_scan_sum_u32(...)
__gpu_suffix_scan_sum_u32(...)
```
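For illustration only (not part of the commit), a minimal kernel exercising the renamed entry points might look like the sketch below. It assumes a fully converged warp (the `~0ull` mask, as in the updated test) and the inclusive-scan semantics shown in the header diff; `scan_example` is a hypothetical name.
```c
#include <gpuintrin.h>

__gpu_kernel void scan_example(void) {
  // Every lane contributes 1 under a full lane mask, mirroring the new test.
  uint32_t prefix = __gpu_prefix_scan_sum_u32(~0ull, 1); // lane i -> i + 1
  uint32_t suffix = __gpu_suffix_scan_sum_u32(~0ull, 1); // lane i -> num_lanes - i
  uint32_t total  = __gpu_lane_sum_u32(~0ull, 1);        // num_lanes on every lane
  (void)prefix;
  (void)suffix;
  (void)total;
}
```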
Added:
Modified:
clang/lib/Headers/gpuintrin.h
clang/test/Headers/gpuintrin.c
libc/src/__support/GPU/utils.h
Removed:
################################################################################
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index 4335ad8c83ddd..4f7eea0cf6188 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -201,67 +201,55 @@ __gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x,
__builtin_bit_cast(uint64_t, __x), __width));
}
-// Gets the accumulator scan of the threads in the warp or wavefront.
-#define __DO_LANE_SCAN(__type, __bitmask_type, __suffix) \
- _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_scan_##__suffix( \
- uint64_t __lane_mask, uint32_t __x) { \
- uint64_t __first = __lane_mask >> __builtin_ctzll(__lane_mask); \
- bool __divergent = __gpu_read_first_lane_##__suffix( \
- __lane_mask, __first & (__first + 1)); \
- if (__divergent) { \
- __type __accum = 0; \
- for (uint64_t __mask = __lane_mask; __mask; __mask &= __mask - 1) { \
- __type __index = __builtin_ctzll(__mask); \
- __type __tmp = __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x, \
- __gpu_num_lanes()); \
- __x = __gpu_lane_id() == __index ? __accum + __tmp : __x; \
- __accum += __tmp; \
- } \
- } else { \
- for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) { \
- uint32_t __index = __gpu_lane_id() - __step; \
- __bitmask_type bitmask = __gpu_lane_id() >= __step; \
- __x += __builtin_bit_cast( \
- __type, \
- -bitmask & __builtin_bit_cast(__bitmask_type, \
- __gpu_shuffle_idx_##__suffix( \
- __lane_mask, __index, __x, \
- __gpu_num_lanes()))); \
- } \
+// Implements scan and reduction operations across a GPU warp or wavefront.
+//
+// Both scans work by iterating log2(N) steps. The bitmask tracks the currently
+// unprocessed lanes, above or below the current lane in the case of a suffix or
+// prefix scan. Each iteration we shuffle in the unprocessed neighbors and then
+// clear the bits that this operation handled.
+#define __DO_LANE_OP(__type, __op, __identity, __prefix, __suffix) \
+ _DEFAULT_FN_ATTRS static __inline__ __type \
+ __gpu_suffix_scan_##__prefix##_##__suffix(uint64_t __lane_mask, \
+ __type __x) { \
+ uint64_t __above = __lane_mask & -(2ull << __gpu_lane_id()); \
+ for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) { \
+ uint32_t __src = __above ? __builtin_ctzg(__above) : __gpu_lane_id(); \
+ __type __result = __gpu_shuffle_idx_##__suffix(__lane_mask, __src, __x, \
+ __gpu_num_lanes()); \
+ __x = __x __op(__above ? __result : (__type)__identity); \
+ for (uint32_t __i = 0; __i < __step; ++__i) \
+ __above &= __above - 1; \
} \
return __x; \
- }
-__DO_LANE_SCAN(uint32_t, uint32_t, u32); // uint32_t __gpu_lane_scan_u32(m, x)
-__DO_LANE_SCAN(uint64_t, uint64_t, u64); // uint64_t __gpu_lane_scan_u64(m, x)
-__DO_LANE_SCAN(float, uint32_t, f32); // float __gpu_lane_scan_f32(m, x)
-__DO_LANE_SCAN(double, uint64_t, f64); // double __gpu_lane_scan_f64(m, x)
-#undef __DO_LANE_SCAN
-
-// Gets the sum of all lanes inside the warp or wavefront.
-#define __DO_LANE_SUM(__type, __suffix) \
- _DEFAULT_FN_ATTRS static __inline__ __type __gpu_lane_sum_##__suffix( \
- uint64_t __lane_mask, __type __x) { \
- uint64_t __first = __lane_mask >> __builtin_ctzll(__lane_mask); \
- bool __divergent = __gpu_read_first_lane_##__suffix( \
- __lane_mask, __first & (__first + 1)); \
- if (__divergent) { \
- return __gpu_shuffle_idx_##__suffix( \
- __lane_mask, 63 - __builtin_clzll(__lane_mask), \
- __gpu_lane_scan_##__suffix(__lane_mask, __x), __gpu_num_lanes()); \
- } else { \
- for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) { \
- uint32_t __index = __step + __gpu_lane_id(); \
- __x += __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x, \
- __gpu_num_lanes()); \
- } \
- return __gpu_read_first_lane_##__suffix(__lane_mask, __x); \
+ } \
+ \
+ _DEFAULT_FN_ATTRS static __inline__ __type \
+ __gpu_prefix_scan_##__prefix##_##__suffix(uint64_t __lane_mask, \
+ __type __x) { \
+ uint64_t __below = __lane_mask & ((1ull << __gpu_lane_id()) - 1); \
+ for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) { \
+ uint32_t __src = \
+ __below ? (63 - __builtin_clzg(__below)) : __gpu_lane_id(); \
+ __type __result = __gpu_shuffle_idx_##__suffix(__lane_mask, __src, __x, \
+ __gpu_num_lanes()); \
+ __x = __x __op(__below ? __result : (__type)__identity); \
+ for (uint32_t __i = 0; __i < __step; ++__i) \
+ __below ^= (1ull << (63 - __builtin_clzg(__below, 0))) & __below; \
} \
+ return __x; \
+ } \
+ \
+ _DEFAULT_FN_ATTRS static __inline__ __type \
+ __gpu_lane_##__prefix##_##__suffix(uint64_t __lane_mask, __type __x) { \
+ return __gpu_read_first_lane_##__suffix( \
+ __lane_mask, \
+ __gpu_suffix_scan_##__prefix##_##__suffix(__lane_mask, __x)); \
}
-__DO_LANE_SUM(uint32_t, u32); // uint32_t __gpu_lane_sum_u32(m, x)
-__DO_LANE_SUM(uint64_t, u64); // uint64_t __gpu_lane_sum_u64(m, x)
-__DO_LANE_SUM(float, f32); // float __gpu_lane_sum_f32(m, x)
-__DO_LANE_SUM(double, f64); // double __gpu_lane_sum_f64(m, x)
-#undef __DO_LANE_SUM
+__DO_LANE_OP(uint32_t, +, 0, sum, u32);
+__DO_LANE_OP(uint64_t, +, 0, sum, u64);
+__DO_LANE_OP(float, +, 0, sum, f32);
+__DO_LANE_OP(double, +, 0, sum, f64);
+#undef __DO_LANE_OP
// Returns a bitmask marking all lanes that have the same value of __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
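As a reading aid (not part of the commit), here is a round-synchronous, single-threaded C model of the prefix scan added above. The warp width and the full mask are assumptions made for the sketch, and the shuffle is modeled by reading each source lane's value from the start of the step.
```c
#include <stdint.h>
#include <stdio.h>

enum { NUM_LANES = 32 }; /* assumed warp width for this sketch */

/* Single-threaded model of __gpu_prefix_scan_sum_u32: each simulated lane keeps
 * a bitmask of unprocessed lower lanes, pulls in the highest one per step (the
 * shuffle), then drops 'step' bits from the top of the mask. */
static void prefix_scan_sum_model(uint64_t lane_mask, uint32_t x[NUM_LANES]) {
  uint64_t below[NUM_LANES];
  for (uint32_t lane = 0; lane < NUM_LANES; ++lane)
    below[lane] = lane_mask & ((1ull << lane) - 1);

  for (uint32_t step = 1; step < NUM_LANES; step *= 2) {
    uint32_t next[NUM_LANES];
    for (uint32_t lane = 0; lane < NUM_LANES; ++lane) {
      /* The "shuffle": read the value the source lane had entering this step. */
      uint32_t src = below[lane]
                         ? 63u - (uint32_t)__builtin_clzll(below[lane])
                         : lane;
      next[lane] = x[lane] + (below[lane] ? x[src] : 0u);
    }
    for (uint32_t lane = 0; lane < NUM_LANES; ++lane) {
      x[lane] = next[lane];
      for (uint32_t i = 0; i < step && below[lane]; ++i)
        below[lane] &= ~(1ull << (63 - __builtin_clzll(below[lane])));
    }
  }
}

int main(void) {
  uint32_t x[NUM_LANES];
  for (uint32_t lane = 0; lane < NUM_LANES; ++lane)
    x[lane] = 1; /* every lane contributes 1, as in the updated test */
  prefix_scan_sum_model(0xFFFFFFFFull, x);
  for (uint32_t lane = 0; lane < NUM_LANES; ++lane)
    printf("lane %2u -> %u\n", lane, x[lane]); /* expect lane + 1 */
  return 0;
}
```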
diff --git a/clang/test/Headers/gpuintrin.c b/clang/test/Headers/gpuintrin.c
index 17c1699ee5c36..04b50acc4a049 100644
--- a/clang/test/Headers/gpuintrin.c
+++ b/clang/test/Headers/gpuintrin.c
@@ -43,6 +43,7 @@ __gpu_kernel void foo() {
__gpu_shuffle_idx_u32(-1, -1, -1, 0);
__gpu_first_lane_id(-1);
__gpu_is_first_in_lane(-1);
+ __gpu_prefix_scan_sum_u32(~0, 1);
__gpu_exit();
}
// AMDGPU-LABEL: define protected amdgpu_kernel void @foo(
@@ -75,6 +76,7 @@ __gpu_kernel void foo() {
// AMDGPU-NEXT: [[CALL22:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1, i32 noundef 0) #[[ATTR8]]
// AMDGPU-NEXT: [[CALL23:%.*]] = call i64 @__gpu_first_lane_id(i64 noundef -1) #[[ATTR8]]
// AMDGPU-NEXT: [[CALL24:%.*]] = call zeroext i1 @__gpu_is_first_in_lane(i64 noundef -1) #[[ATTR8]]
+// AMDGPU-NEXT: [[CALL25:%.*]] = call i32 @__gpu_prefix_scan_sum_u32(i64 noundef -1, i32 noundef 1) #[[ATTR8]]
// AMDGPU-NEXT: call void @__gpu_exit() #[[ATTR9:[0-9]+]]
// AMDGPU-NEXT: unreachable
//
@@ -525,6 +527,113 @@ __gpu_kernel void foo() {
// AMDGPU-NEXT: ret i1 [[CMP]]
//
//
+// AMDGPU-LABEL: define internal i32 @__gpu_prefix_scan_sum_u32(
+// AMDGPU-SAME: i64 noundef [[__LANE_MASK:%.*]], i32 noundef [[__X:%.*]]) #[[ATTR0]] {
+// AMDGPU-NEXT: [[ENTRY:.*:]]
+// AMDGPU-NEXT: [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
+// AMDGPU-NEXT: [[__X_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT: [[__BELOW:%.*]] = alloca i64, align 8, addrspace(5)
+// AMDGPU-NEXT: [[__STEP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT: [[__SRC:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT: [[__RESULT:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT: [[__I:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT: [[__LANE_MASK_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__LANE_MASK_ADDR]] to ptr
+// AMDGPU-NEXT: [[__X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__X_ADDR]] to ptr
+// AMDGPU-NEXT: [[__BELOW_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__BELOW]] to ptr
+// AMDGPU-NEXT: [[__STEP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__STEP]] to ptr
+// AMDGPU-NEXT: [[__SRC_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__SRC]] to ptr
+// AMDGPU-NEXT: [[__RESULT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__RESULT]] to ptr
+// AMDGPU-NEXT: [[__I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__I]] to ptr
+// AMDGPU-NEXT: store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR_ASCAST]], align 8
+// AMDGPU-NEXT: store i32 [[__X]], ptr [[__X_ADDR_ASCAST]], align 4
+// AMDGPU-NEXT: [[TMP0:%.*]] = load i64, ptr [[__LANE_MASK_ADDR_ASCAST]], align 8
+// AMDGPU-NEXT: [[CALL:%.*]] = call i32 @__gpu_lane_id() #[[ATTR8]]
+// AMDGPU-NEXT: [[SH_PROM:%.*]] = zext i32 [[CALL]] to i64
+// AMDGPU-NEXT: [[SHL:%.*]] = shl i64 1, [[SH_PROM]]
+// AMDGPU-NEXT: [[SUB:%.*]] = sub i64 [[SHL]], 1
+// AMDGPU-NEXT: [[AND:%.*]] = and i64 [[TMP0]], [[SUB]]
+// AMDGPU-NEXT: store i64 [[AND]], ptr [[__BELOW_ASCAST]], align 8
+// AMDGPU-NEXT: store i32 1, ptr [[__STEP_ASCAST]], align 4
+// AMDGPU-NEXT: br label %[[FOR_COND:.*]]
+// AMDGPU: [[FOR_COND]]:
+// AMDGPU-NEXT: [[TMP1:%.*]] = load i32, ptr [[__STEP_ASCAST]], align 4
+// AMDGPU-NEXT: [[CALL1:%.*]] = call i32 @__gpu_num_lanes() #[[ATTR8]]
+// AMDGPU-NEXT: [[CMP:%.*]] = icmp ult i32 [[TMP1]], [[CALL1]]
+// AMDGPU-NEXT: br i1 [[CMP]], label %[[FOR_BODY:.*]], label %[[FOR_END20:.*]]
+// AMDGPU: [[FOR_BODY]]:
+// AMDGPU-NEXT: [[TMP2:%.*]] = load i64, ptr [[__BELOW_ASCAST]], align 8
+// AMDGPU-NEXT: [[TOBOOL:%.*]] = icmp ne i64 [[TMP2]], 0
+// AMDGPU-NEXT: br i1 [[TOBOOL]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]]
+// AMDGPU: [[COND_TRUE]]:
+// AMDGPU-NEXT: [[TMP3:%.*]] = load i64, ptr [[__BELOW_ASCAST]], align 8
+// AMDGPU-NEXT: [[TMP4:%.*]] = call i64 @llvm.ctlz.i64(i64 [[TMP3]], i1 true)
+// AMDGPU-NEXT: [[CAST:%.*]] = trunc i64 [[TMP4]] to i32
+// AMDGPU-NEXT: [[SUB2:%.*]] = sub nsw i32 63, [[CAST]]
+// AMDGPU-NEXT: br label %[[COND_END:.*]]
+// AMDGPU: [[COND_FALSE]]:
+// AMDGPU-NEXT: [[CALL3:%.*]] = call i32 @__gpu_lane_id() #[[ATTR8]]
+// AMDGPU-NEXT: br label %[[COND_END]]
+// AMDGPU: [[COND_END]]:
+// AMDGPU-NEXT: [[COND:%.*]] = phi i32 [ [[SUB2]], %[[COND_TRUE]] ], [ [[CALL3]], %[[COND_FALSE]] ]
+// AMDGPU-NEXT: store i32 [[COND]], ptr [[__SRC_ASCAST]], align 4
+// AMDGPU-NEXT: [[TMP5:%.*]] = load i64, ptr [[__LANE_MASK_ADDR_ASCAST]], align 8
+// AMDGPU-NEXT: [[TMP6:%.*]] = load i32, ptr [[__SRC_ASCAST]], align 4
+// AMDGPU-NEXT: [[TMP7:%.*]] = load i32, ptr [[__X_ADDR_ASCAST]], align 4
+// AMDGPU-NEXT: [[CALL4:%.*]] = call i32 @__gpu_num_lanes() #[[ATTR8]]
+// AMDGPU-NEXT: [[CALL5:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef [[TMP5]], i32 noundef [[TMP6]], i32 noundef [[TMP7]], i32 noundef [[CALL4]]) #[[ATTR8]]
+// AMDGPU-NEXT: store i32 [[CALL5]], ptr [[__RESULT_ASCAST]], align 4
+// AMDGPU-NEXT: [[TMP8:%.*]] = load i32, ptr [[__X_ADDR_ASCAST]], align 4
+// AMDGPU-NEXT: [[TMP9:%.*]] = load i64, ptr [[__BELOW_ASCAST]], align 8
+// AMDGPU-NEXT: [[TOBOOL6:%.*]] = icmp ne i64 [[TMP9]], 0
+// AMDGPU-NEXT: br i1 [[TOBOOL6]], label %[[COND_TRUE7:.*]], label %[[COND_FALSE8:.*]]
+// AMDGPU: [[COND_TRUE7]]:
+// AMDGPU-NEXT: [[TMP10:%.*]] = load i32, ptr [[__RESULT_ASCAST]], align 4
+// AMDGPU-NEXT: br label %[[COND_END9:.*]]
+// AMDGPU: [[COND_FALSE8]]:
+// AMDGPU-NEXT: br label %[[COND_END9]]
+// AMDGPU: [[COND_END9]]:
+// AMDGPU-NEXT: [[COND10:%.*]] = phi i32 [ [[TMP10]], %[[COND_TRUE7]] ], [ 0, %[[COND_FALSE8]] ]
+// AMDGPU-NEXT: [[ADD:%.*]] = add i32 [[TMP8]], [[COND10]]
+// AMDGPU-NEXT: store i32 [[ADD]], ptr [[__X_ADDR_ASCAST]], align 4
+// AMDGPU-NEXT: store i32 0, ptr [[__I_ASCAST]], align 4
+// AMDGPU-NEXT: br label %[[FOR_COND11:.*]]
+// AMDGPU: [[FOR_COND11]]:
+// AMDGPU-NEXT: [[TMP11:%.*]] = load i32, ptr [[__I_ASCAST]], align 4
+// AMDGPU-NEXT: [[TMP12:%.*]] = load i32, ptr [[__STEP_ASCAST]], align 4
+// AMDGPU-NEXT: [[CMP12:%.*]] = icmp ult i32 [[TMP11]], [[TMP12]]
+// AMDGPU-NEXT: br i1 [[CMP12]], label %[[FOR_BODY13:.*]], label %[[FOR_END:.*]]
+// AMDGPU: [[FOR_BODY13]]:
+// AMDGPU-NEXT: [[TMP13:%.*]] = load i64, ptr [[__BELOW_ASCAST]], align 8
+// AMDGPU-NEXT: [[TMP14:%.*]] = call i64 @llvm.ctlz.i64(i64 [[TMP13]], i1 true)
+// AMDGPU-NEXT: [[CAST14:%.*]] = trunc i64 [[TMP14]] to i32
+// AMDGPU-NEXT: [[ISZERO:%.*]] = icmp eq i64 [[TMP13]], 0
+// AMDGPU-NEXT: [[CLZG:%.*]] = select i1 [[ISZERO]], i32 0, i32 [[CAST14]]
+// AMDGPU-NEXT: [[SUB15:%.*]] = sub nsw i32 63, [[CLZG]]
+// AMDGPU-NEXT: [[SH_PROM16:%.*]] = zext i32 [[SUB15]] to i64
+// AMDGPU-NEXT: [[SHL17:%.*]] = shl i64 1, [[SH_PROM16]]
+// AMDGPU-NEXT: [[TMP15:%.*]] = load i64, ptr [[__BELOW_ASCAST]], align 8
+// AMDGPU-NEXT: [[AND18:%.*]] = and i64 [[SHL17]], [[TMP15]]
+// AMDGPU-NEXT: [[TMP16:%.*]] = load i64, ptr [[__BELOW_ASCAST]], align 8
+// AMDGPU-NEXT: [[XOR:%.*]] = xor i64 [[TMP16]], [[AND18]]
+// AMDGPU-NEXT: store i64 [[XOR]], ptr [[__BELOW_ASCAST]], align 8
+// AMDGPU-NEXT: br label %[[FOR_INC:.*]]
+// AMDGPU: [[FOR_INC]]:
+// AMDGPU-NEXT: [[TMP17:%.*]] = load i32, ptr [[__I_ASCAST]], align 4
+// AMDGPU-NEXT: [[INC:%.*]] = add i32 [[TMP17]], 1
+// AMDGPU-NEXT: store i32 [[INC]], ptr [[__I_ASCAST]], align 4
+// AMDGPU-NEXT: br label %[[FOR_COND11]], !llvm.loop [[LOOP5:![0-9]+]]
+// AMDGPU: [[FOR_END]]:
+// AMDGPU-NEXT: br label %[[FOR_INC19:.*]]
+// AMDGPU: [[FOR_INC19]]:
+// AMDGPU-NEXT: [[TMP18:%.*]] = load i32, ptr [[__STEP_ASCAST]], align 4
+// AMDGPU-NEXT: [[MUL:%.*]] = mul i32 [[TMP18]], 2
+// AMDGPU-NEXT: store i32 [[MUL]], ptr [[__STEP_ASCAST]], align 4
+// AMDGPU-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP7:![0-9]+]]
+// AMDGPU: [[FOR_END20]]:
+// AMDGPU-NEXT: [[TMP19:%.*]] = load i32, ptr [[__X_ADDR_ASCAST]], align 4
+// AMDGPU-NEXT: ret i32 [[TMP19]]
+//
+//
// AMDGPU-LABEL: define internal void @__gpu_exit(
// AMDGPU-SAME: ) #[[ATTR1:[0-9]+]] {
// AMDGPU-NEXT: [[ENTRY:.*:]]
@@ -562,6 +671,7 @@ __gpu_kernel void foo() {
// NVPTX-NEXT: [[CALL22:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1, i32 noundef 0) #[[ATTR6]]
// NVPTX-NEXT: [[CALL23:%.*]] = call i64 @__gpu_first_lane_id(i64 noundef -1) #[[ATTR6]]
// NVPTX-NEXT: [[CALL24:%.*]] = call zeroext i1 @__gpu_is_first_in_lane(i64 noundef -1) #[[ATTR6]]
+// NVPTX-NEXT: [[CALL25:%.*]] = call i32 @__gpu_prefix_scan_sum_u32(i64 noundef -1, i32 noundef 1) #[[ATTR6]]
// NVPTX-NEXT: call void @__gpu_exit() #[[ATTR7:[0-9]+]]
// NVPTX-NEXT: unreachable
//
@@ -967,6 +1077,106 @@ __gpu_kernel void foo() {
// NVPTX-NEXT: ret i1 [[CMP]]
//
//
+// NVPTX-LABEL: define internal i32 @__gpu_prefix_scan_sum_u32(
+// NVPTX-SAME: i64 noundef [[__LANE_MASK:%.*]], i32 noundef [[__X:%.*]]) #[[ATTR0]] {
+// NVPTX-NEXT: [[ENTRY:.*:]]
+// NVPTX-NEXT: [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8
+// NVPTX-NEXT: [[__X_ADDR:%.*]] = alloca i32, align 4
+// NVPTX-NEXT: [[__BELOW:%.*]] = alloca i64, align 8
+// NVPTX-NEXT: [[__STEP:%.*]] = alloca i32, align 4
+// NVPTX-NEXT: [[__SRC:%.*]] = alloca i32, align 4
+// NVPTX-NEXT: [[__RESULT:%.*]] = alloca i32, align 4
+// NVPTX-NEXT: [[__I:%.*]] = alloca i32, align 4
+// NVPTX-NEXT: store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR]], align 8
+// NVPTX-NEXT: store i32 [[__X]], ptr [[__X_ADDR]], align 4
+// NVPTX-NEXT: [[TMP0:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8
+// NVPTX-NEXT: [[CALL:%.*]] = call i32 @__gpu_lane_id() #[[ATTR6]]
+// NVPTX-NEXT: [[SH_PROM:%.*]] = zext i32 [[CALL]] to i64
+// NVPTX-NEXT: [[SHL:%.*]] = shl i64 1, [[SH_PROM]]
+// NVPTX-NEXT: [[SUB:%.*]] = sub i64 [[SHL]], 1
+// NVPTX-NEXT: [[AND:%.*]] = and i64 [[TMP0]], [[SUB]]
+// NVPTX-NEXT: store i64 [[AND]], ptr [[__BELOW]], align 8
+// NVPTX-NEXT: store i32 1, ptr [[__STEP]], align 4
+// NVPTX-NEXT: br label %[[FOR_COND:.*]]
+// NVPTX: [[FOR_COND]]:
+// NVPTX-NEXT: [[TMP1:%.*]] = load i32, ptr [[__STEP]], align 4
+// NVPTX-NEXT: [[CALL1:%.*]] = call i32 @__gpu_num_lanes() #[[ATTR6]]
+// NVPTX-NEXT: [[CMP:%.*]] = icmp ult i32 [[TMP1]], [[CALL1]]
+// NVPTX-NEXT: br i1 [[CMP]], label %[[FOR_BODY:.*]], label %[[FOR_END20:.*]]
+// NVPTX: [[FOR_BODY]]:
+// NVPTX-NEXT: [[TMP2:%.*]] = load i64, ptr [[__BELOW]], align 8
+// NVPTX-NEXT: [[TOBOOL:%.*]] = icmp ne i64 [[TMP2]], 0
+// NVPTX-NEXT: br i1 [[TOBOOL]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]]
+// NVPTX: [[COND_TRUE]]:
+// NVPTX-NEXT: [[TMP3:%.*]] = load i64, ptr [[__BELOW]], align 8
+// NVPTX-NEXT: [[TMP4:%.*]] = call i64 @llvm.ctlz.i64(i64 [[TMP3]], i1 true)
+// NVPTX-NEXT: [[CAST:%.*]] = trunc i64 [[TMP4]] to i32
+// NVPTX-NEXT: [[SUB2:%.*]] = sub nsw i32 63, [[CAST]]
+// NVPTX-NEXT: br label %[[COND_END:.*]]
+// NVPTX: [[COND_FALSE]]:
+// NVPTX-NEXT: [[CALL3:%.*]] = call i32 @__gpu_lane_id() #[[ATTR6]]
+// NVPTX-NEXT: br label %[[COND_END]]
+// NVPTX: [[COND_END]]:
+// NVPTX-NEXT: [[COND:%.*]] = phi i32 [ [[SUB2]], %[[COND_TRUE]] ], [ [[CALL3]], %[[COND_FALSE]] ]
+// NVPTX-NEXT: store i32 [[COND]], ptr [[__SRC]], align 4
+// NVPTX-NEXT: [[TMP5:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8
+// NVPTX-NEXT: [[TMP6:%.*]] = load i32, ptr [[__SRC]], align 4
+// NVPTX-NEXT: [[TMP7:%.*]] = load i32, ptr [[__X_ADDR]], align 4
+// NVPTX-NEXT: [[CALL4:%.*]] = call i32 @__gpu_num_lanes() #[[ATTR6]]
+// NVPTX-NEXT: [[CALL5:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef [[TMP5]], i32 noundef [[TMP6]], i32 noundef [[TMP7]], i32 noundef [[CALL4]]) #[[ATTR6]]
+// NVPTX-NEXT: store i32 [[CALL5]], ptr [[__RESULT]], align 4
+// NVPTX-NEXT: [[TMP8:%.*]] = load i32, ptr [[__X_ADDR]], align 4
+// NVPTX-NEXT: [[TMP9:%.*]] = load i64, ptr [[__BELOW]], align 8
+// NVPTX-NEXT: [[TOBOOL6:%.*]] = icmp ne i64 [[TMP9]], 0
+// NVPTX-NEXT: br i1 [[TOBOOL6]], label %[[COND_TRUE7:.*]], label %[[COND_FALSE8:.*]]
+// NVPTX: [[COND_TRUE7]]:
+// NVPTX-NEXT: [[TMP10:%.*]] = load i32, ptr [[__RESULT]], align 4
+// NVPTX-NEXT: br label %[[COND_END9:.*]]
+// NVPTX: [[COND_FALSE8]]:
+// NVPTX-NEXT: br label %[[COND_END9]]
+// NVPTX: [[COND_END9]]:
+// NVPTX-NEXT: [[COND10:%.*]] = phi i32 [ [[TMP10]], %[[COND_TRUE7]] ], [ 0, %[[COND_FALSE8]] ]
+// NVPTX-NEXT: [[ADD:%.*]] = add i32 [[TMP8]], [[COND10]]
+// NVPTX-NEXT: store i32 [[ADD]], ptr [[__X_ADDR]], align 4
+// NVPTX-NEXT: store i32 0, ptr [[__I]], align 4
+// NVPTX-NEXT: br label %[[FOR_COND11:.*]]
+// NVPTX: [[FOR_COND11]]:
+// NVPTX-NEXT: [[TMP11:%.*]] = load i32, ptr [[__I]], align 4
+// NVPTX-NEXT: [[TMP12:%.*]] = load i32, ptr [[__STEP]], align 4
+// NVPTX-NEXT: [[CMP12:%.*]] = icmp ult i32 [[TMP11]], [[TMP12]]
+// NVPTX-NEXT: br i1 [[CMP12]], label %[[FOR_BODY13:.*]], label %[[FOR_END:.*]]
+// NVPTX: [[FOR_BODY13]]:
+// NVPTX-NEXT: [[TMP13:%.*]] = load i64, ptr [[__BELOW]], align 8
+// NVPTX-NEXT: [[TMP14:%.*]] = call i64 @llvm.ctlz.i64(i64 [[TMP13]], i1 true)
+// NVPTX-NEXT: [[CAST14:%.*]] = trunc i64 [[TMP14]] to i32
+// NVPTX-NEXT: [[ISZERO:%.*]] = icmp eq i64 [[TMP13]], 0
+// NVPTX-NEXT: [[CLZG:%.*]] = select i1 [[ISZERO]], i32 0, i32 [[CAST14]]
+// NVPTX-NEXT: [[SUB15:%.*]] = sub nsw i32 63, [[CLZG]]
+// NVPTX-NEXT: [[SH_PROM16:%.*]] = zext i32 [[SUB15]] to i64
+// NVPTX-NEXT: [[SHL17:%.*]] = shl i64 1, [[SH_PROM16]]
+// NVPTX-NEXT: [[TMP15:%.*]] = load i64, ptr [[__BELOW]], align 8
+// NVPTX-NEXT: [[AND18:%.*]] = and i64 [[SHL17]], [[TMP15]]
+// NVPTX-NEXT: [[TMP16:%.*]] = load i64, ptr [[__BELOW]], align 8
+// NVPTX-NEXT: [[XOR:%.*]] = xor i64 [[TMP16]], [[AND18]]
+// NVPTX-NEXT: store i64 [[XOR]], ptr [[__BELOW]], align 8
+// NVPTX-NEXT: br label %[[FOR_INC:.*]]
+// NVPTX: [[FOR_INC]]:
+// NVPTX-NEXT: [[TMP17:%.*]] = load i32, ptr [[__I]], align 4
+// NVPTX-NEXT: [[INC:%.*]] = add i32 [[TMP17]], 1
+// NVPTX-NEXT: store i32 [[INC]], ptr [[__I]], align 4
+// NVPTX-NEXT: br label %[[FOR_COND11]], !llvm.loop [[LOOP1:![0-9]+]]
+// NVPTX: [[FOR_END]]:
+// NVPTX-NEXT: br label %[[FOR_INC19:.*]]
+// NVPTX: [[FOR_INC19]]:
+// NVPTX-NEXT: [[TMP18:%.*]] = load i32, ptr [[__STEP]], align 4
+// NVPTX-NEXT: [[MUL:%.*]] = mul i32 [[TMP18]], 2
+// NVPTX-NEXT: store i32 [[MUL]], ptr [[__STEP]], align 4
+// NVPTX-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]]
+// NVPTX: [[FOR_END20]]:
+// NVPTX-NEXT: [[TMP19:%.*]] = load i32, ptr [[__X_ADDR]], align 4
+// NVPTX-NEXT: ret i32 [[TMP19]]
+//
+//
// NVPTX-LABEL: define internal void @__gpu_exit(
// NVPTX-SAME: ) #[[ATTR1:[0-9]+]] {
// NVPTX-NEXT: [[ENTRY:.*:]]
@@ -1004,6 +1214,7 @@ __gpu_kernel void foo() {
// SPIRV-NEXT: [[CALL22:%.*]] = call spir_func i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1, i32 noundef 0)
// SPIRV-NEXT: [[CALL23:%.*]] = call spir_func i64 @__gpu_first_lane_id(i64 noundef -1)
// SPIRV-NEXT: [[CALL24:%.*]] = call spir_func zeroext i1 @__gpu_is_first_in_lane(i64 noundef -1)
+// SPIRV-NEXT: [[CALL25:%.*]] = call spir_func i32 @__gpu_prefix_scan_sum_u32(i64 noundef -1, i32 noundef 1)
// SPIRV-NEXT: call spir_func void @__gpu_exit() #[[ATTR7:[0-9]+]]
// SPIRV-NEXT: unreachable
//
@@ -1401,6 +1612,106 @@ __gpu_kernel void foo() {
// SPIRV-NEXT: ret i1 [[CMP]]
//
//
+// SPIRV-LABEL: define internal spir_func i32 @__gpu_prefix_scan_sum_u32(
+// SPIRV-SAME: i64 noundef [[__LANE_MASK:%.*]], i32 noundef [[__X:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8
+// SPIRV-NEXT: [[__X_ADDR:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[__BELOW:%.*]] = alloca i64, align 8
+// SPIRV-NEXT: [[__STEP:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[__SRC:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[__RESULT:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[__I:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[__X]], ptr [[__X_ADDR]], align 4
+// SPIRV-NEXT: [[TMP0:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8
+// SPIRV-NEXT: [[CALL:%.*]] = call spir_func i32 @__gpu_lane_id()
+// SPIRV-NEXT: [[SH_PROM:%.*]] = zext i32 [[CALL]] to i64
+// SPIRV-NEXT: [[SHL:%.*]] = shl i64 1, [[SH_PROM]]
+// SPIRV-NEXT: [[SUB:%.*]] = sub i64 [[SHL]], 1
+// SPIRV-NEXT: [[AND:%.*]] = and i64 [[TMP0]], [[SUB]]
+// SPIRV-NEXT: store i64 [[AND]], ptr [[__BELOW]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[__STEP]], align 4
+// SPIRV-NEXT: br label %[[FOR_COND:.*]]
+// SPIRV: [[FOR_COND]]:
+// SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr [[__STEP]], align 4
+// SPIRV-NEXT: [[CALL1:%.*]] = call spir_func i32 @__gpu_num_lanes()
+// SPIRV-NEXT: [[CMP:%.*]] = icmp ult i32 [[TMP1]], [[CALL1]]
+// SPIRV-NEXT: br i1 [[CMP]], label %[[FOR_BODY:.*]], label %[[FOR_END20:.*]]
+// SPIRV: [[FOR_BODY]]:
+// SPIRV-NEXT: [[TMP2:%.*]] = load i64, ptr [[__BELOW]], align 8
+// SPIRV-NEXT: [[TOBOOL:%.*]] = icmp ne i64 [[TMP2]], 0
+// SPIRV-NEXT: br i1 [[TOBOOL]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]]
+// SPIRV: [[COND_TRUE]]:
+// SPIRV-NEXT: [[TMP3:%.*]] = load i64, ptr [[__BELOW]], align 8
+// SPIRV-NEXT: [[TMP4:%.*]] = call i64 @llvm.ctlz.i64(i64 [[TMP3]], i1 true)
+// SPIRV-NEXT: [[CAST:%.*]] = trunc i64 [[TMP4]] to i32
+// SPIRV-NEXT: [[SUB2:%.*]] = sub nsw i32 63, [[CAST]]
+// SPIRV-NEXT: br label %[[COND_END:.*]]
+// SPIRV: [[COND_FALSE]]:
+// SPIRV-NEXT: [[CALL3:%.*]] = call spir_func i32 @__gpu_lane_id()
+// SPIRV-NEXT: br label %[[COND_END]]
+// SPIRV: [[COND_END]]:
+// SPIRV-NEXT: [[COND:%.*]] = phi i32 [ [[SUB2]], %[[COND_TRUE]] ], [ [[CALL3]], %[[COND_FALSE]] ]
+// SPIRV-NEXT: store i32 [[COND]], ptr [[__SRC]], align 4
+// SPIRV-NEXT: [[TMP5:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8
+// SPIRV-NEXT: [[TMP6:%.*]] = load i32, ptr [[__SRC]], align 4
+// SPIRV-NEXT: [[TMP7:%.*]] = load i32, ptr [[__X_ADDR]], align 4
+// SPIRV-NEXT: [[CALL4:%.*]] = call spir_func i32 @__gpu_num_lanes()
+// SPIRV-NEXT: [[CALL5:%.*]] = call spir_func i32 @__gpu_shuffle_idx_u32(i64 noundef [[TMP5]], i32 noundef [[TMP6]], i32 noundef [[TMP7]], i32 noundef [[CALL4]])
+// SPIRV-NEXT: store i32 [[CALL5]], ptr [[__RESULT]], align 4
+// SPIRV-NEXT: [[TMP8:%.*]] = load i32, ptr [[__X_ADDR]], align 4
+// SPIRV-NEXT: [[TMP9:%.*]] = load i64, ptr [[__BELOW]], align 8
+// SPIRV-NEXT: [[TOBOOL6:%.*]] = icmp ne i64 [[TMP9]], 0
+// SPIRV-NEXT: br i1 [[TOBOOL6]], label %[[COND_TRUE7:.*]], label %[[COND_FALSE8:.*]]
+// SPIRV: [[COND_TRUE7]]:
+// SPIRV-NEXT: [[TMP10:%.*]] = load i32, ptr [[__RESULT]], align 4
+// SPIRV-NEXT: br label %[[COND_END9:.*]]
+// SPIRV: [[COND_FALSE8]]:
+// SPIRV-NEXT: br label %[[COND_END9]]
+// SPIRV: [[COND_END9]]:
+// SPIRV-NEXT: [[COND10:%.*]] = phi i32 [ [[TMP10]], %[[COND_TRUE7]] ], [ 0, %[[COND_FALSE8]] ]
+// SPIRV-NEXT: [[ADD:%.*]] = add i32 [[TMP8]], [[COND10]]
+// SPIRV-NEXT: store i32 [[ADD]], ptr [[__X_ADDR]], align 4
+// SPIRV-NEXT: store i32 0, ptr [[__I]], align 4
+// SPIRV-NEXT: br label %[[FOR_COND11:.*]]
+// SPIRV: [[FOR_COND11]]:
+// SPIRV-NEXT: [[TMP11:%.*]] = load i32, ptr [[__I]], align 4
+// SPIRV-NEXT: [[TMP12:%.*]] = load i32, ptr [[__STEP]], align 4
+// SPIRV-NEXT: [[CMP12:%.*]] = icmp ult i32 [[TMP11]], [[TMP12]]
+// SPIRV-NEXT: br i1 [[CMP12]], label %[[FOR_BODY13:.*]], label %[[FOR_END:.*]]
+// SPIRV: [[FOR_BODY13]]:
+// SPIRV-NEXT: [[TMP13:%.*]] = load i64, ptr [[__BELOW]], align 8
+// SPIRV-NEXT: [[TMP14:%.*]] = call i64 @llvm.ctlz.i64(i64 [[TMP13]], i1 true)
+// SPIRV-NEXT: [[CAST14:%.*]] = trunc i64 [[TMP14]] to i32
+// SPIRV-NEXT: [[ISZERO:%.*]] = icmp eq i64 [[TMP13]], 0
+// SPIRV-NEXT: [[CLZG:%.*]] = select i1 [[ISZERO]], i32 0, i32 [[CAST14]]
+// SPIRV-NEXT: [[SUB15:%.*]] = sub nsw i32 63, [[CLZG]]
+// SPIRV-NEXT: [[SH_PROM16:%.*]] = zext i32 [[SUB15]] to i64
+// SPIRV-NEXT: [[SHL17:%.*]] = shl i64 1, [[SH_PROM16]]
+// SPIRV-NEXT: [[TMP15:%.*]] = load i64, ptr [[__BELOW]], align 8
+// SPIRV-NEXT: [[AND18:%.*]] = and i64 [[SHL17]], [[TMP15]]
+// SPIRV-NEXT: [[TMP16:%.*]] = load i64, ptr [[__BELOW]], align 8
+// SPIRV-NEXT: [[XOR:%.*]] = xor i64 [[TMP16]], [[AND18]]
+// SPIRV-NEXT: store i64 [[XOR]], ptr [[__BELOW]], align 8
+// SPIRV-NEXT: br label %[[FOR_INC:.*]]
+// SPIRV: [[FOR_INC]]:
+// SPIRV-NEXT: [[TMP17:%.*]] = load i32, ptr [[__I]], align 4
+// SPIRV-NEXT: [[INC:%.*]] = add i32 [[TMP17]], 1
+// SPIRV-NEXT: store i32 [[INC]], ptr [[__I]], align 4
+// SPIRV-NEXT: br label %[[FOR_COND11]], !llvm.loop [[LOOP1:![0-9]+]]
+// SPIRV: [[FOR_END]]:
+// SPIRV-NEXT: br label %[[FOR_INC19:.*]]
+// SPIRV: [[FOR_INC19]]:
+// SPIRV-NEXT: [[TMP18:%.*]] = load i32, ptr [[__STEP]], align 4
+// SPIRV-NEXT: [[MUL:%.*]] = mul i32 [[TMP18]], 2
+// SPIRV-NEXT: store i32 [[MUL]], ptr [[__STEP]], align 4
+// SPIRV-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]]
+// SPIRV: [[FOR_END20]]:
+// SPIRV-NEXT: [[TMP19:%.*]] = load i32, ptr [[__X_ADDR]], align 4
+// SPIRV-NEXT: ret i32 [[TMP19]]
+//
+//
// SPIRV-LABEL: define internal spir_func void @__gpu_exit(
// SPIRV-SAME: ) #[[ATTR1:[0-9]+]] {
// SPIRV-NEXT: [[ENTRY:.*:]]
@@ -1411,4 +1722,15 @@ __gpu_kernel void foo() {
// AMDGPU: [[RNG2]] = !{i32 1, i32 0}
// AMDGPU: [[META3]] = !{}
// AMDGPU: [[RNG4]] = !{i16 1, i16 1025}
+// AMDGPU: [[LOOP5]] = distinct !{[[LOOP5]], [[META6:![0-9]+]]}
+// AMDGPU: [[META6]] = !{!"llvm.loop.mustprogress"}
+// AMDGPU: [[LOOP7]] = distinct !{[[LOOP7]], [[META6]]}
+//.
+// NVPTX: [[LOOP1]] = distinct !{[[LOOP1]], [[META2:![0-9]+]]}
+// NVPTX: [[META2]] = !{!"llvm.loop.mustprogress"}
+// NVPTX: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]]}
+//.
+// SPIRV: [[LOOP1]] = distinct !{[[LOOP1]], [[META2:![0-9]+]]}
+// SPIRV: [[META2]] = !{!"llvm.loop.mustprogress"}
+// SPIRV: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]]}
//.
diff --git a/libc/src/__support/GPU/utils.h b/libc/src/__support/GPU/utils.h
index 1b3e6edfc4e0d..1916f57959037 100644
--- a/libc/src/__support/GPU/utils.h
+++ b/libc/src/__support/GPU/utils.h
@@ -123,7 +123,7 @@ LIBC_INLINE uint32_t reduce(uint64_t lane_mask, uint32_t x) {
}
LIBC_INLINE uint32_t scan(uint64_t lane_mask, uint32_t x) {
- return __gpu_lane_scan_u32(lane_mask, x);
+ return __gpu_prefix_scan_sum_u32(lane_mask, x);
}
LIBC_INLINE uint64_t fixed_frequency_clock() {