[libc-commits] [clang] [libc] [Clang] Improve scan in gpuintrin.h (PR #189381)
Joseph Huber via libc-commits
libc-commits at lists.llvm.org
Mon Mar 30 06:59:59 PDT 2026
https://github.com/jhuber6 updated https://github.com/llvm/llvm-project/pull/189381
>From 2dd6b6b4c097c5b8a73c804e040f16c59a0c9d6e Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Mon, 30 Mar 2026 08:36:56 -0500
Subject: [PATCH] [Clang] Improve scan in gpuintrin.h
Summary:
Right now the scan adds a check to avoid the unspecified result of
`clzg(0)` (and `ctzg(0)`). That result is only used as the source lane
for the shuffle instruction, and the shuffled value is discarded when the
mask is zero anyway. So, we simply give the builtin a fallback value, pass
the resulting out-of-range lane index to the shuffle, and then discard
what it returns. This should be fine. The scan routines are expected to
be optimal.
Also renames `sum` to `add`.
---
clang/lib/Headers/gpuintrin.h | 13 ++++++-------
libc/src/__support/GPU/utils.h | 4 ++--
2 files changed, 8 insertions(+), 9 deletions(-)
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index ef1446a3ac77b..f3628e781a9f0 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -213,7 +213,7 @@ __gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x,
__type __x) { \
uint64_t __above = __lane_mask & -(2ull << __gpu_lane_id()); \
for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) { \
- uint32_t __src = __above ? __builtin_ctzg(__above) : __gpu_lane_id(); \
+ uint32_t __src = __builtin_ctzg(__above, -1); \
__type __result = __gpu_shuffle_idx_##__suffix(__lane_mask, __src, __x, \
__gpu_num_lanes()); \
__x = __op(__x, __above ? __result : (__type)__identity); \
@@ -228,8 +228,7 @@ __gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x,
__type __x) { \
uint64_t __below = __lane_mask & ((1ull << __gpu_lane_id()) - 1); \
for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) { \
- uint32_t __src = \
- __below ? (63 - __builtin_clzg(__below)) : __gpu_lane_id(); \
+ uint32_t __src = 63 - __builtin_clzg(__below, -1); \
__type __result = __gpu_shuffle_idx_##__suffix(__lane_mask, __src, __x, \
__gpu_num_lanes()); \
__x = __op(__x, __below ? __result : (__type)__identity); \
@@ -247,10 +246,10 @@ __gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x,
}
#define __GPU_OP(__x, __y) ((__x) + (__y))
-__DO_LANE_OPS(uint32_t, __GPU_OP, 0, sum, u32);
-__DO_LANE_OPS(uint64_t, __GPU_OP, 0, sum, u64);
-__DO_LANE_OPS(float, __GPU_OP, 0, sum, f32);
-__DO_LANE_OPS(double, __GPU_OP, 0, sum, f64);
+__DO_LANE_OPS(uint32_t, __GPU_OP, 0, add, u32);
+__DO_LANE_OPS(uint64_t, __GPU_OP, 0, add, u64);
+__DO_LANE_OPS(float, __GPU_OP, 0, add, f32);
+__DO_LANE_OPS(double, __GPU_OP, 0, add, f64);
#undef __GPU_OP
#define __GPU_OP(__x, __y) ((__x) & (__y))
diff --git a/libc/src/__support/GPU/utils.h b/libc/src/__support/GPU/utils.h
index 1916f57959037..e12684bf25c6a 100644
--- a/libc/src/__support/GPU/utils.h
+++ b/libc/src/__support/GPU/utils.h
@@ -119,11 +119,11 @@ LIBC_INLINE bool is_first_lane(uint64_t lane_mask) {
}
LIBC_INLINE uint32_t reduce(uint64_t lane_mask, uint32_t x) {
- return __gpu_lane_sum_u32(lane_mask, x);
+ return __gpu_lane_add_u32(lane_mask, x);
}
LIBC_INLINE uint32_t scan(uint64_t lane_mask, uint32_t x) {
- return __gpu_prefix_scan_sum_u32(lane_mask, x);
+ return __gpu_prefix_scan_add_u32(lane_mask, x);
}
LIBC_INLINE uint64_t fixed_frequency_clock() {
More information about the libc-commits mailing list