[clang] [libc] [Clang] Update the 'gpuintrin.h' lane scan handling (PR #185451)
Matt Arsenault via cfe-commits
cfe-commits at lists.llvm.org
Mon Mar 9 10:40:11 PDT 2026
================
@@ -201,67 +201,55 @@ __gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x,
__builtin_bit_cast(uint64_t, __x), __width));
}
-// Gets the accumulator scan of the threads in the warp or wavefront.
-#define __DO_LANE_SCAN(__type, __bitmask_type, __suffix) \
- _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_scan_##__suffix( \
- uint64_t __lane_mask, uint32_t __x) { \
- uint64_t __first = __lane_mask >> __builtin_ctzll(__lane_mask); \
- bool __divergent = __gpu_read_first_lane_##__suffix( \
- __lane_mask, __first & (__first + 1)); \
- if (__divergent) { \
- __type __accum = 0; \
- for (uint64_t __mask = __lane_mask; __mask; __mask &= __mask - 1) { \
- __type __index = __builtin_ctzll(__mask); \
- __type __tmp = __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x, \
- __gpu_num_lanes()); \
- __x = __gpu_lane_id() == __index ? __accum + __tmp : __x; \
- __accum += __tmp; \
- } \
- } else { \
- for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) { \
- uint32_t __index = __gpu_lane_id() - __step; \
- __bitmask_type bitmask = __gpu_lane_id() >= __step; \
- __x += __builtin_bit_cast( \
- __type, \
- -bitmask & __builtin_bit_cast(__bitmask_type, \
- __gpu_shuffle_idx_##__suffix( \
- __lane_mask, __index, __x, \
- __gpu_num_lanes()))); \
- } \
+// Implements scan and reduction operations across a GPU warp or wavefront.
+//
+// Both scans work by iterating log2(N) steps. The bitmask tracks the currently
+// unprocessed lanes, above or below the current lane in the case of a suffix or
+// prefix scan. Each iteration we shuffle in the unprocessed neighbors and then
+// clear the bits that this operation handled.
+#define __DO_LANE_OP(__type, __op, __identity, __prefix, __suffix) \
+ _DEFAULT_FN_ATTRS static __inline__ __type \
+ __gpu_suffix_scan_##__prefix##_##__suffix(uint64_t __lane_mask, \
+ __type __x) { \
+ uint64_t __above = __lane_mask & -(1ull << (__gpu_lane_id() + 1)); \
----------------
arsenm wrote:
Currently this pattern will fail to reduce to 32 bits on wave32. It is probably worthwhile to conditionalize it on the wave size until the optimizations are fixed, but that can be done in a follow-up.
https://github.com/llvm/llvm-project/pull/185451
More information about the cfe-commits
mailing list