[clang] [libc] [Clang] Update the 'gpuintrin.h' lane scan handling (PR #185451)
Joseph Huber via cfe-commits
cfe-commits at lists.llvm.org
Mon Mar 9 10:42:51 PDT 2026
================
@@ -201,67 +201,55 @@ __gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x,
__builtin_bit_cast(uint64_t, __x), __width));
}
-// Gets the accumulator scan of the threads in the warp or wavefront.
-#define __DO_LANE_SCAN(__type, __bitmask_type, __suffix) \
- _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_scan_##__suffix( \
- uint64_t __lane_mask, uint32_t __x) { \
- uint64_t __first = __lane_mask >> __builtin_ctzll(__lane_mask); \
- bool __divergent = __gpu_read_first_lane_##__suffix( \
- __lane_mask, __first & (__first + 1)); \
- if (__divergent) { \
- __type __accum = 0; \
- for (uint64_t __mask = __lane_mask; __mask; __mask &= __mask - 1) { \
- __type __index = __builtin_ctzll(__mask); \
- __type __tmp = __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x, \
- __gpu_num_lanes()); \
- __x = __gpu_lane_id() == __index ? __accum + __tmp : __x; \
- __accum += __tmp; \
- } \
- } else { \
- for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) { \
- uint32_t __index = __gpu_lane_id() - __step; \
- __bitmask_type bitmask = __gpu_lane_id() >= __step; \
- __x += __builtin_bit_cast( \
- __type, \
- -bitmask & __builtin_bit_cast(__bitmask_type, \
- __gpu_shuffle_idx_##__suffix( \
- __lane_mask, __index, __x, \
- __gpu_num_lanes()))); \
- } \
+// Implements scan and reduction operations across a GPU warp or wavefront.
+//
+// Both scans work by iterating log2(N) steps. The bitmask tracks the currently
+// unprocessed lanes, above or below the current lane in the case of a suffix or
+// prefix scan. Each iteration we shuffle in the unprocessed neighbors and then
+// clear the bits that this operation handled.
+#define __DO_LANE_OP(__type, __op, __identity, __prefix, __suffix) \
+ _DEFAULT_FN_ATTRS static __inline__ __type \
+ __gpu_suffix_scan_##__prefix##_##__suffix(uint64_t __lane_mask, \
+ __type __x) { \
+ uint64_t __above = __lane_mask & -(1ull << (__gpu_lane_id() + 1)); \
----------------
jhuber6 wrote:
This is intended to compute a mask of all the active lanes above the current one. For lane id `63` on w64 the shift becomes `1ull << 64`, which I expect to evaluate to zero, correctly indicating that there are no lanes above this one. That said, shifting a 64-bit value by its full width is formally undefined behavior in C, so I could build and test this on a w64 machine to be extra safe.
https://github.com/llvm/llvm-project/pull/185451
More information about the cfe-commits
mailing list