[clang] [libc] [Clang] Update the 'gpuintrin.h' lane scan handling (PR #185451)
Matt Arsenault via cfe-commits
cfe-commits at lists.llvm.org
Mon Mar 9 10:40:11 PDT 2026
================
@@ -201,67 +201,55 @@ __gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x,
__builtin_bit_cast(uint64_t, __x), __width));
}
-// Gets the accumulator scan of the threads in the warp or wavefront.
-#define __DO_LANE_SCAN(__type, __bitmask_type, __suffix) \
- _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_scan_##__suffix( \
- uint64_t __lane_mask, uint32_t __x) { \
- uint64_t __first = __lane_mask >> __builtin_ctzll(__lane_mask); \
- bool __divergent = __gpu_read_first_lane_##__suffix( \
- __lane_mask, __first & (__first + 1)); \
- if (__divergent) { \
- __type __accum = 0; \
- for (uint64_t __mask = __lane_mask; __mask; __mask &= __mask - 1) { \
- __type __index = __builtin_ctzll(__mask); \
- __type __tmp = __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x, \
- __gpu_num_lanes()); \
- __x = __gpu_lane_id() == __index ? __accum + __tmp : __x; \
- __accum += __tmp; \
- } \
- } else { \
- for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) { \
- uint32_t __index = __gpu_lane_id() - __step; \
- __bitmask_type bitmask = __gpu_lane_id() >= __step; \
- __x += __builtin_bit_cast( \
- __type, \
- -bitmask & __builtin_bit_cast(__bitmask_type, \
- __gpu_shuffle_idx_##__suffix( \
- __lane_mask, __index, __x, \
- __gpu_num_lanes()))); \
- } \
+// Implements scan and reduction operations across a GPU warp or wavefront.
+//
+// Both scans work by iterating log2(N) steps. The bitmask tracks the currently
+// unprocessed lanes, above or below the current lane in the case of a suffix or
+// prefix scan. Each iteration we shuffle in the unprocessed neighbors and then
+// clear the bits that this operation handled.
+#define __DO_LANE_OP(__type, __op, __identity, __prefix, __suffix) \
+ _DEFAULT_FN_ATTRS static __inline__ __type \
+ __gpu_suffix_scan_##__prefix##_##__suffix(uint64_t __lane_mask, \
+ __type __x) { \
+ uint64_t __above = __lane_mask & -(1ull << (__gpu_lane_id() + 1)); \
----------------
arsenm wrote:
Currently this pattern will fail to reduce to 32 bits on wave32. It is probably worthwhile to conditionalize it on the wave size until the optimizations are fixed, but that can be done in a follow-up.
https://github.com/llvm/llvm-project/pull/185451
More information about the cfe-commits
mailing list