[libc-commits] [clang] [libc] [Clang] Improve scan in gpuintrin.h (PR #189381)

Mon Mar 30 06:43:04 PDT 2026

llvmbot wrote:



@llvm/pr-subscribers-backend-x86

@llvm/pr-subscribers-libc

Author: Joseph Huber (jhuber6)

<details>
<summary>Changes</summary>

Summary:
Right now the scan checks to avoid the unspecified behavior in
`clzg(0)`. Its result is used as the source lane for the shuffle
instruction, but the shuffled value is discarded when the mask is zero
anyway. So, we simply pass the unspecified value to the shuffle and then
discard the result. This should be fine. The scan routines are expected
to be optimal.

Also renames `sum` to `add`.


---
Full diff: https://github.com/llvm/llvm-project/pull/189381.diff


2 Files Affected:

- (modified) clang/lib/Headers/gpuintrin.h (+6-7) 
- (modified) libc/src/__support/GPU/utils.h (+2-2) 


``````````diff

diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index ef1446a3ac77b..be7d4aa7db1f5 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -213,7 +213,7 @@ __gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x,
                                             __type __x) {                      \
     uint64_t __above = __lane_mask & -(2ull << __gpu_lane_id());               \
     for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) {       \
-      uint32_t __src = __above ? __builtin_ctzg(__above) : __gpu_lane_id();    \
+      uint32_t __src = __builtin_ctzg(__above);                                \
       __type __result = __gpu_shuffle_idx_##__suffix(__lane_mask, __src, __x,  \
                                                      __gpu_num_lanes());       \
       __x = __op(__x, __above ? __result : (__type)__identity);                \
@@ -228,8 +228,7 @@ __gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x,
                                             __type __x) {                      \
     uint64_t __below = __lane_mask & ((1ull << __gpu_lane_id()) - 1);          \
     for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) {       \
-      uint32_t __src =                                                         \
-          __below ? (63 - __builtin_clzg(__below)) : __gpu_lane_id();          \
+      uint32_t __src = 63 - __builtin_clzg(__below);                           \
       __type __result = __gpu_shuffle_idx_##__suffix(__lane_mask, __src, __x,  \
                                                      __gpu_num_lanes());       \
       __x = __op(__x, __below ? __result : (__type)__identity);                \
@@ -247,10 +246,10 @@ __gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x,
   }
 
 #define __GPU_OP(__x, __y) ((__x) + (__y))
-__DO_LANE_OPS(uint32_t, __GPU_OP, 0, sum, u32);
-__DO_LANE_OPS(uint64_t, __GPU_OP, 0, sum, u64);
-__DO_LANE_OPS(float, __GPU_OP, 0, sum, f32);
-__DO_LANE_OPS(double, __GPU_OP, 0, sum, f64);
+__DO_LANE_OPS(uint32_t, __GPU_OP, 0, add, u32);
+__DO_LANE_OPS(uint64_t, __GPU_OP, 0, add, u64);
+__DO_LANE_OPS(float, __GPU_OP, 0, add, f32);
+__DO_LANE_OPS(double, __GPU_OP, 0, add, f64);
 #undef __GPU_OP
 
 #define __GPU_OP(__x, __y) ((__x) & (__y))
diff --git a/libc/src/__support/GPU/utils.h b/libc/src/__support/GPU/utils.h
index 1916f57959037..e12684bf25c6a 100644
--- a/libc/src/__support/GPU/utils.h
+++ b/libc/src/__support/GPU/utils.h
@@ -119,11 +119,11 @@ LIBC_INLINE bool is_first_lane(uint64_t lane_mask) {
 }
 
 LIBC_INLINE uint32_t reduce(uint64_t lane_mask, uint32_t x) {
-  return __gpu_lane_sum_u32(lane_mask, x);
+  return __gpu_lane_add_u32(lane_mask, x);
 }
 
 LIBC_INLINE uint32_t scan(uint64_t lane_mask, uint32_t x) {
-  return __gpu_prefix_scan_sum_u32(lane_mask, x);
+  return __gpu_prefix_scan_add_u32(lane_mask, x);
 }
 
 LIBC_INLINE uint64_t fixed_frequency_clock() {

``````````

</details>


https://github.com/llvm/llvm-project/pull/189381