[clang] [libc] [Clang] Update the 'gpuintrin.h' lane scan handling (PR #185451)

Mon Mar 9 09:30:29 PDT 2026

https://github.com/jhuber6 created https://github.com/llvm/llvm-project/pull/185451

Summary:
This patch replaces the divergent branch in the lane reduction with a
more efficient algorithm. It also provides prefix and suffix versions of
the scan; the lane sum is now just the first element of the suffix scan.

This changes the scan function names to the following, which is
technically a breaking change, but I don't think the old names were
really used in practice, and it's a trivial change to make based on the
clang version if it's really needed.
```
__gpu_prefix_scan_sum_u32(...)
__gpu_suffix_scan_sum_u32(...)
```


>From e359de3fc7ec2782f69096422f6af446097dbb8f Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Mon, 9 Mar 2026 11:21:13 -0500
Subject: [PATCH] [Clang] Update the 'gpuintrin.h' lane scan handling

Summary:
This patch replaces the divergent branch in the lane reduction with a
more efficient algorithm. It also provides prefix and suffix versions of
the scan; the lane sum is now just the first element of the suffix scan.

This changes the scan function names to the following, which is
technically a breaking change, but I don't think the old names were
really used in practice, and it's a trivial change to make based on the
clang version if it's really needed.
```
__gpu_prefix_scan_sum_u32(...)
__gpu_suffix_scan_sum_u32(...)
```
---
 clang/lib/Headers/gpuintrin.h  | 90 +++++++++++++++-------------------
 libc/src/__support/GPU/utils.h |  2 +-
 2 files changed, 40 insertions(+), 52 deletions(-)

diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index 4335ad8c83ddd..aa04b54970a34 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -201,66 +201,54 @@ __gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x,
                             __builtin_bit_cast(uint64_t, __x), __width));
 }
 
-// Gets the accumulator scan of the threads in the warp or wavefront.
-#define __DO_LANE_SCAN(__type, __bitmask_type, __suffix)                       \
-  _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_scan_##__suffix(     \
-      uint64_t __lane_mask, uint32_t __x) {                                    \
-    uint64_t __first = __lane_mask >> __builtin_ctzll(__lane_mask);            \
-    bool __divergent = __gpu_read_first_lane_##__suffix(                       \
-        __lane_mask, __first & (__first + 1));                                 \
-    if (__divergent) {                                                         \
-      __type __accum = 0;                                                      \
-      for (uint64_t __mask = __lane_mask; __mask; __mask &= __mask - 1) {      \
-        __type __index = __builtin_ctzll(__mask);                              \
-        __type __tmp = __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x, \
-                                                    __gpu_num_lanes());        \
-        __x = __gpu_lane_id() == __index ? __accum + __tmp : __x;              \
-        __accum += __tmp;                                                      \
-      }                                                                        \
-    } else {                                                                   \
-      for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) {     \
-        uint32_t __index = __gpu_lane_id() - __step;                           \
-        __bitmask_type bitmask = __gpu_lane_id() >= __step;                    \
-        __x += __builtin_bit_cast(                                             \
-            __type,                                                            \
-            -bitmask & __builtin_bit_cast(__bitmask_type,                      \
-                                          __gpu_shuffle_idx_##__suffix(        \
-                                              __lane_mask, __index, __x,       \
-                                              __gpu_num_lanes())));            \
-      }                                                                        \
+// Gets the suffix scan (inclusive scan from right) of the warp or wavefront.
+#define __DO_SUFFIX_SCAN_SUM(__type, __suffix)                                 \
+  _DEFAULT_FN_ATTRS static __inline__ __type __gpu_suffix_scan_sum_##__suffix( \
+      uint64_t __lane_mask, __type __x) {                                      \
+    uint64_t __above = __lane_mask & -(2ull << __gpu_lane_id());               \
+    for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) {       \
+      uint32_t __src = __above ? __builtin_ctzg(__above) : __gpu_lane_id();    \
+      __type __result = __gpu_shuffle_idx_##__suffix(__lane_mask, __src, __x,  \
+                                                     __gpu_num_lanes());       \
+      __x += __above ? __result : (__type)0;                                   \
+      for (uint32_t __i = 0; __i < __step; ++__i)                              \
+        __above &= __above - 1;                                                \
     }                                                                          \
     return __x;                                                                \
   }
-__DO_LANE_SCAN(uint32_t, uint32_t, u32); // uint32_t __gpu_lane_scan_u32(m, x)
-__DO_LANE_SCAN(uint64_t, uint64_t, u64); // uint64_t __gpu_lane_scan_u64(m, x)
-__DO_LANE_SCAN(float, uint32_t, f32);    // float __gpu_lane_scan_f32(m, x)
-__DO_LANE_SCAN(double, uint64_t, f64);   // double __gpu_lane_scan_f64(m, x)
-#undef __DO_LANE_SCAN
+__DO_SUFFIX_SCAN_SUM(uint32_t, u32);
+__DO_SUFFIX_SCAN_SUM(uint64_t, u64);
+__DO_SUFFIX_SCAN_SUM(float, f32);
+__DO_SUFFIX_SCAN_SUM(double, f64);
+#undef __DO_SUFFIX_SCAN_SUM
+
+// Gets the prefix scan (inclusive scan from left) of the warp or wavefront.
+#define __DO_PREFIX_SCAN_SUM(__type, __suffix)                                 \
+  _DEFAULT_FN_ATTRS static __inline__ __type __gpu_prefix_scan_sum_##__suffix( \
+      uint64_t __lane_mask, __type __x) {                                      \
+    __type __y = __gpu_suffix_scan_sum_##__suffix(__lane_mask, __x);           \
+    return __gpu_shuffle_idx_##__suffix(__lane_mask,                           \
+                                        __builtin_ctzg(__lane_mask), __y,      \
+                                        __gpu_num_lanes()) -                   \
+           __y + __x;                                                          \
+  }
+__DO_PREFIX_SCAN_SUM(uint32_t, u32);
+__DO_PREFIX_SCAN_SUM(uint64_t, u64);
+__DO_PREFIX_SCAN_SUM(float, f32);
+__DO_PREFIX_SCAN_SUM(double, f64);
+#undef __DO_PREFIX_SCAN_SUM
 
 // Gets the sum of all lanes inside the warp or wavefront.
 #define __DO_LANE_SUM(__type, __suffix)                                        \
   _DEFAULT_FN_ATTRS static __inline__ __type __gpu_lane_sum_##__suffix(        \
       uint64_t __lane_mask, __type __x) {                                      \
-    uint64_t __first = __lane_mask >> __builtin_ctzll(__lane_mask);            \
-    bool __divergent = __gpu_read_first_lane_##__suffix(                       \
-        __lane_mask, __first & (__first + 1));                                 \
-    if (__divergent) {                                                         \
-      return __gpu_shuffle_idx_##__suffix(                                     \
-          __lane_mask, 63 - __builtin_clzll(__lane_mask),                      \
-          __gpu_lane_scan_##__suffix(__lane_mask, __x), __gpu_num_lanes());    \
-    } else {                                                                   \
-      for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) {     \
-        uint32_t __index = __step + __gpu_lane_id();                           \
-        __x += __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x,         \
-                                            __gpu_num_lanes());                \
-      }                                                                        \
-      return __gpu_read_first_lane_##__suffix(__lane_mask, __x);               \
-    }                                                                          \
+    return __gpu_read_first_lane_##__suffix(                                   \
+        __lane_mask, __gpu_suffix_scan_sum_##__suffix(__lane_mask, __x));      \
   }
-__DO_LANE_SUM(uint32_t, u32); // uint32_t __gpu_lane_sum_u32(m, x)
-__DO_LANE_SUM(uint64_t, u64); // uint64_t __gpu_lane_sum_u64(m, x)
-__DO_LANE_SUM(float, f32);    // float __gpu_lane_sum_f32(m, x)
-__DO_LANE_SUM(double, f64);   // double __gpu_lane_sum_f64(m, x)
+__DO_LANE_SUM(uint32_t, u32);
+__DO_LANE_SUM(uint64_t, u64);
+__DO_LANE_SUM(float, f32);
+__DO_LANE_SUM(double, f64);
 #undef __DO_LANE_SUM
 
 // Returns a bitmask marking all lanes that have the same value of __x.
diff --git a/libc/src/__support/GPU/utils.h b/libc/src/__support/GPU/utils.h
index 1b3e6edfc4e0d..1916f57959037 100644
--- a/libc/src/__support/GPU/utils.h
+++ b/libc/src/__support/GPU/utils.h
@@ -123,7 +123,7 @@ LIBC_INLINE uint32_t reduce(uint64_t lane_mask, uint32_t x) {
 }
 
 LIBC_INLINE uint32_t scan(uint64_t lane_mask, uint32_t x) {
-  return __gpu_lane_scan_u32(lane_mask, x);
+  return __gpu_prefix_scan_sum_u32(lane_mask, x);
 }
 
 LIBC_INLINE uint64_t fixed_frequency_clock() {