[clang] [libc] [Clang] Update the 'gpuintrin.h' lane scan handling (PR #185451)
Joseph Huber via cfe-commits
cfe-commits at lists.llvm.org
Mon Mar 9 09:30:29 PDT 2026
https://github.com/jhuber6 created https://github.com/llvm/llvm-project/pull/185451
Summary:
This patch uses a more efficient algorithm for the lane scan and reduction
that no longer needs a separate branch for divergent lane masks. We also
provide both a prefix and a suffix version of the scan; the lane sum is now
just the first lane's element of the suffix scan.
This renames `__gpu_lane_scan_*` to the names below, which is technically a
breaking change, but I don't think these were really used in practice, and it
is a trivial change to guard on the Clang version if it's really needed.
```
__gpu_prefix_scan_sum_u32(...)
__gpu_suffix_scan_sum_u32(...)
```
>From e359de3fc7ec2782f69096422f6af446097dbb8f Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Mon, 9 Mar 2026 11:21:13 -0500
Subject: [PATCH] [Clang] Update the 'gpuintrin.h' lane scan handling
Summary:
This patch uses a more efficient algorithm for the lane scan and reduction
that no longer needs a separate branch for divergent lane masks. We also
provide both a prefix and a suffix version of the scan; the lane sum is now
just the first lane's element of the suffix scan.
This renames `__gpu_lane_scan_*` to the names below, which is technically a
breaking change, but I don't think these were really used in practice, and it
is a trivial change to guard on the Clang version if it's really needed.
```
__gpu_prefix_scan_sum_u32(...)
__gpu_suffix_scan_sum_u32(...)
```
---
clang/lib/Headers/gpuintrin.h | 90 +++++++++++++++-------------------
libc/src/__support/GPU/utils.h | 2 +-
2 files changed, 40 insertions(+), 52 deletions(-)
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index 4335ad8c83ddd..aa04b54970a34 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -201,66 +201,54 @@ __gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x,
__builtin_bit_cast(uint64_t, __x), __width));
}
-// Gets the accumulator scan of the threads in the warp or wavefront.
-#define __DO_LANE_SCAN(__type, __bitmask_type, __suffix) \
- _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_scan_##__suffix( \
- uint64_t __lane_mask, uint32_t __x) { \
- uint64_t __first = __lane_mask >> __builtin_ctzll(__lane_mask); \
- bool __divergent = __gpu_read_first_lane_##__suffix( \
- __lane_mask, __first & (__first + 1)); \
- if (__divergent) { \
- __type __accum = 0; \
- for (uint64_t __mask = __lane_mask; __mask; __mask &= __mask - 1) { \
- __type __index = __builtin_ctzll(__mask); \
- __type __tmp = __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x, \
- __gpu_num_lanes()); \
- __x = __gpu_lane_id() == __index ? __accum + __tmp : __x; \
- __accum += __tmp; \
- } \
- } else { \
- for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) { \
- uint32_t __index = __gpu_lane_id() - __step; \
- __bitmask_type bitmask = __gpu_lane_id() >= __step; \
- __x += __builtin_bit_cast( \
- __type, \
- -bitmask & __builtin_bit_cast(__bitmask_type, \
- __gpu_shuffle_idx_##__suffix( \
- __lane_mask, __index, __x, \
- __gpu_num_lanes()))); \
- } \
+// Gets the suffix scan (inclusive scan from right) of the warp or wavefront.
+#define __DO_SUFFIX_SCAN_SUM(__type, __suffix) \
+ _DEFAULT_FN_ATTRS static __inline__ __type __gpu_suffix_scan_sum_##__suffix( \
+ uint64_t __lane_mask, __type __x) { \
+ uint64_t __above = __lane_mask & -(2ull << __gpu_lane_id()); \
+ for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) { \
+ uint32_t __src = __above ? __builtin_ctzg(__above) : __gpu_lane_id(); \
+ __type __result = __gpu_shuffle_idx_##__suffix(__lane_mask, __src, __x, \
+ __gpu_num_lanes()); \
+ __x += __above ? __result : (__type)0; \
+ for (uint32_t __i = 0; __i < __step; ++__i) \
+ __above &= __above - 1; \
} \
return __x; \
}
-__DO_LANE_SCAN(uint32_t, uint32_t, u32); // uint32_t __gpu_lane_scan_u32(m, x)
-__DO_LANE_SCAN(uint64_t, uint64_t, u64); // uint64_t __gpu_lane_scan_u64(m, x)
-__DO_LANE_SCAN(float, uint32_t, f32); // float __gpu_lane_scan_f32(m, x)
-__DO_LANE_SCAN(double, uint64_t, f64); // double __gpu_lane_scan_f64(m, x)
-#undef __DO_LANE_SCAN
+__DO_SUFFIX_SCAN_SUM(uint32_t, u32);
+__DO_SUFFIX_SCAN_SUM(uint64_t, u64);
+__DO_SUFFIX_SCAN_SUM(float, f32);
+__DO_SUFFIX_SCAN_SUM(double, f64);
+#undef __DO_SUFFIX_SCAN_SUM
+
+// Gets the prefix scan (inclusive scan from left) of the warp or wavefront.
+#define __DO_PREFIX_SCAN_SUM(__type, __suffix) \
+ _DEFAULT_FN_ATTRS static __inline__ __type __gpu_prefix_scan_sum_##__suffix( \
+ uint64_t __lane_mask, __type __x) { \
+ __type __y = __gpu_suffix_scan_sum_##__suffix(__lane_mask, __x); \
+ return __gpu_shuffle_idx_##__suffix(__lane_mask, \
+ __builtin_ctzg(__lane_mask), __y, \
+ __gpu_num_lanes()) - \
+ __y + __x; \
+ }
+__DO_PREFIX_SCAN_SUM(uint32_t, u32);
+__DO_PREFIX_SCAN_SUM(uint64_t, u64);
+__DO_PREFIX_SCAN_SUM(float, f32);
+__DO_PREFIX_SCAN_SUM(double, f64);
+#undef __DO_PREFIX_SCAN_SUM
// Gets the sum of all lanes inside the warp or wavefront.
#define __DO_LANE_SUM(__type, __suffix) \
_DEFAULT_FN_ATTRS static __inline__ __type __gpu_lane_sum_##__suffix( \
uint64_t __lane_mask, __type __x) { \
- uint64_t __first = __lane_mask >> __builtin_ctzll(__lane_mask); \
- bool __divergent = __gpu_read_first_lane_##__suffix( \
- __lane_mask, __first & (__first + 1)); \
- if (__divergent) { \
- return __gpu_shuffle_idx_##__suffix( \
- __lane_mask, 63 - __builtin_clzll(__lane_mask), \
- __gpu_lane_scan_##__suffix(__lane_mask, __x), __gpu_num_lanes()); \
- } else { \
- for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) { \
- uint32_t __index = __step + __gpu_lane_id(); \
- __x += __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x, \
- __gpu_num_lanes()); \
- } \
- return __gpu_read_first_lane_##__suffix(__lane_mask, __x); \
- } \
+ return __gpu_read_first_lane_##__suffix( \
+ __lane_mask, __gpu_suffix_scan_sum_##__suffix(__lane_mask, __x)); \
}
-__DO_LANE_SUM(uint32_t, u32); // uint32_t __gpu_lane_sum_u32(m, x)
-__DO_LANE_SUM(uint64_t, u64); // uint64_t __gpu_lane_sum_u64(m, x)
-__DO_LANE_SUM(float, f32); // float __gpu_lane_sum_f32(m, x)
-__DO_LANE_SUM(double, f64); // double __gpu_lane_sum_f64(m, x)
+__DO_LANE_SUM(uint32_t, u32);
+__DO_LANE_SUM(uint64_t, u64);
+__DO_LANE_SUM(float, f32);
+__DO_LANE_SUM(double, f64);
#undef __DO_LANE_SUM
// Returns a bitmask marking all lanes that have the same value of __x.
diff --git a/libc/src/__support/GPU/utils.h b/libc/src/__support/GPU/utils.h
index 1b3e6edfc4e0d..1916f57959037 100644
--- a/libc/src/__support/GPU/utils.h
+++ b/libc/src/__support/GPU/utils.h
@@ -123,7 +123,7 @@ LIBC_INLINE uint32_t reduce(uint64_t lane_mask, uint32_t x) {
}
LIBC_INLINE uint32_t scan(uint64_t lane_mask, uint32_t x) {
- return __gpu_lane_scan_u32(lane_mask, x);
+ return __gpu_prefix_scan_sum_u32(lane_mask, x);
}
LIBC_INLINE uint64_t fixed_frequency_clock() {
More information about the cfe-commits
mailing list