[llvm-branch-commits] [clang] [libc] release/20.x: [Clang] Fix cross-lane scan when given divergent lanes (#127703) (PR #128085)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Thu Feb 20 15:12:06 PST 2025
llvmbot wrote:
@llvm/pr-subscribers-libc
Author: None (llvmbot)
Changes:
Backport 6cc7ca084a5bbb7ccf606cab12065604453dde59
Requested by: @jhuber6
---
Full diff: https://github.com/llvm/llvm-project/pull/128085.diff
3 Files Affected:
- (modified) clang/lib/Headers/gpuintrin.h (+49-25)
- (modified) clang/lib/Headers/nvptxintrin.h (+4-1)
- (modified) libc/test/integration/src/__support/GPU/scan_reduce.cpp (+49)
``````````diff
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index 11c87e85cd497..efdc3d94ac0b3 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -150,35 +150,33 @@ __gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x,
__builtin_bit_cast(uint64_t, __x), __width));
}
-// Gets the sum of all lanes inside the warp or wavefront.
-#define __DO_LANE_SUM(__type, __suffix) \
- _DEFAULT_FN_ATTRS static __inline__ __type __gpu_lane_sum_##__suffix( \
- uint64_t __lane_mask, __type __x) { \
- for (uint32_t __step = __gpu_num_lanes() / 2; __step > 0; __step /= 2) { \
- uint32_t __index = __step + __gpu_lane_id(); \
- __x += __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x, \
- __gpu_num_lanes()); \
- } \
- return __gpu_read_first_lane_##__suffix(__lane_mask, __x); \
- }
-__DO_LANE_SUM(uint32_t, u32); // uint32_t __gpu_lane_sum_u32(m, x)
-__DO_LANE_SUM(uint64_t, u64); // uint64_t __gpu_lane_sum_u64(m, x)
-__DO_LANE_SUM(float, f32); // float __gpu_lane_sum_f32(m, x)
-__DO_LANE_SUM(double, f64); // double __gpu_lane_sum_f64(m, x)
-#undef __DO_LANE_SUM
-
// Gets the accumulator scan of the threads in the warp or wavefront.
#define __DO_LANE_SCAN(__type, __bitmask_type, __suffix) \
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_scan_##__suffix( \
uint64_t __lane_mask, uint32_t __x) { \
- for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) { \
- uint32_t __index = __gpu_lane_id() - __step; \
- __bitmask_type bitmask = __gpu_lane_id() >= __step; \
- __x += __builtin_bit_cast( \
- __type, -bitmask & __builtin_bit_cast(__bitmask_type, \
- __gpu_shuffle_idx_##__suffix( \
- __lane_mask, __index, __x, \
- __gpu_num_lanes()))); \
+ uint64_t __first = __lane_mask >> __builtin_ctzll(__lane_mask); \
+ bool __divergent = __gpu_read_first_lane_##__suffix( \
+ __lane_mask, __first & (__first + 1)); \
+ if (__divergent) { \
+ __type __accum = 0; \
+ for (uint64_t __mask = __lane_mask; __mask; __mask &= __mask - 1) { \
+ __type __index = __builtin_ctzll(__mask); \
+ __type __tmp = __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x, \
+ __gpu_num_lanes()); \
+ __x = __gpu_lane_id() == __index ? __accum + __tmp : __x; \
+ __accum += __tmp; \
+ } \
+ } else { \
+ for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) { \
+ uint32_t __index = __gpu_lane_id() - __step; \
+ __bitmask_type bitmask = __gpu_lane_id() >= __step; \
+ __x += __builtin_bit_cast( \
+ __type, \
+ -bitmask & __builtin_bit_cast(__bitmask_type, \
+ __gpu_shuffle_idx_##__suffix( \
+ __lane_mask, __index, __x, \
+ __gpu_num_lanes()))); \
+ } \
} \
return __x; \
}
@@ -188,6 +186,32 @@ __DO_LANE_SCAN(float, uint32_t, f32); // float __gpu_lane_scan_f32(m, x)
__DO_LANE_SCAN(double, uint64_t, f64); // double __gpu_lane_scan_f64(m, x)
#undef __DO_LANE_SCAN
+// Gets the sum of all lanes inside the warp or wavefront.
+#define __DO_LANE_SUM(__type, __suffix) \
+ _DEFAULT_FN_ATTRS static __inline__ __type __gpu_lane_sum_##__suffix( \
+ uint64_t __lane_mask, __type __x) { \
+ uint64_t __first = __lane_mask >> __builtin_ctzll(__lane_mask); \
+ bool __divergent = __gpu_read_first_lane_##__suffix( \
+ __lane_mask, __first & (__first + 1)); \
+ if (__divergent) { \
+ return __gpu_shuffle_idx_##__suffix( \
+ __lane_mask, 63 - __builtin_clzll(__lane_mask), \
+ __gpu_lane_scan_##__suffix(__lane_mask, __x), __gpu_num_lanes()); \
+ } else { \
+ for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) { \
+ uint32_t __index = __step + __gpu_lane_id(); \
+ __x += __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x, \
+ __gpu_num_lanes()); \
+ } \
+ return __gpu_read_first_lane_##__suffix(__lane_mask, __x); \
+ } \
+ }
+__DO_LANE_SUM(uint32_t, u32); // uint32_t __gpu_lane_sum_u32(m, x)
+__DO_LANE_SUM(uint64_t, u64); // uint64_t __gpu_lane_sum_u64(m, x)
+__DO_LANE_SUM(float, f32); // float __gpu_lane_sum_f32(m, x)
+__DO_LANE_SUM(double, f64); // double __gpu_lane_sum_f64(m, x)
+#undef __DO_LANE_SUM
+
_Pragma("omp end declare variant");
_Pragma("omp end declare target");
diff --git a/clang/lib/Headers/nvptxintrin.h b/clang/lib/Headers/nvptxintrin.h
index 40fa2edebe975..0afcb1c5ff0f0 100644
--- a/clang/lib/Headers/nvptxintrin.h
+++ b/clang/lib/Headers/nvptxintrin.h
@@ -151,8 +151,11 @@ _DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) {
_DEFAULT_FN_ATTRS static __inline__ uint32_t
__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
uint32_t __width) {
+ // Mask out inactive lanes to match AMDGPU behavior.
uint32_t __mask = (uint32_t)__lane_mask;
- return __nvvm_shfl_sync_idx_i32(__mask, __x, __idx,
+ bool __bitmask = (1ull << __idx) & __lane_mask;
+ return -__bitmask &
+ __nvvm_shfl_sync_idx_i32(__mask, __x, __idx,
((__gpu_num_lanes() - __width) << 8u) | 0x1f);
}
diff --git a/libc/test/integration/src/__support/GPU/scan_reduce.cpp b/libc/test/integration/src/__support/GPU/scan_reduce.cpp
index bc621c3300cbe..1d50e1f99bf31 100644
--- a/libc/test/integration/src/__support/GPU/scan_reduce.cpp
+++ b/libc/test/integration/src/__support/GPU/scan_reduce.cpp
@@ -53,10 +53,59 @@ static void test_scan() {
EXPECT_EQ(z, gpu::get_lane_id() % 2 ? gpu::get_lane_id() / 2 + 1 : 0);
}
+static uint32_t random(uint64_t *rand_next) {
+ uint64_t x = *rand_next;
+ x ^= x >> 12;
+ x ^= x << 25;
+ x ^= x >> 27;
+ *rand_next = x;
+ return static_cast<uint32_t>((x * 0x2545F4914F6CDD1Dul) >> 32);
+}
+
+// Scan operations can break down under thread divergence, make sure that the
+// function works under some random divergence. We do this by trivially
+// implementing a scan with shared scratch memory and then comparing the
+// results.
+static void test_scan_divergent() {
+ static uint32_t input[64] = {0};
+ static uint32_t result[64] = {0};
+ uint64_t state = gpu::processor_clock() + __gpu_lane_id();
+
+ for (int i = 0; i < 64; ++i) {
+ uint64_t lanemask = gpu::get_lane_mask();
+ if (random(&state) & (1ull << gpu::get_lane_id())) {
+ uint64_t divergent = gpu::get_lane_mask();
+ uint32_t value = random(&state) % 256;
+ input[gpu::get_lane_id()] = value;
+
+ if (gpu::is_first_lane(divergent)) {
+ uint32_t accumulator = 0;
+ for (uint32_t lane = 0; lane < gpu::get_lane_size(); ++lane) {
+ uint32_t tmp = input[lane];
+ result[lane] = tmp + accumulator;
+ accumulator += tmp;
+ }
+ }
+ gpu::sync_lane(divergent);
+
+ uint32_t scan = gpu::scan(divergent, value);
+ EXPECT_EQ(scan, result[gpu::get_lane_id()]);
+ }
+ if (gpu::is_first_lane(lanemask))
+ __builtin_memset(input, 0, sizeof(input));
+ gpu::sync_lane(lanemask);
+ }
+}
+
TEST_MAIN(int argc, char **argv, char **envp) {
+ if (gpu::get_thread_id() >= gpu::get_lane_size())
+ return 0;
+
test_reduce();
test_scan();
+ test_scan_divergent();
+
return 0;
}
``````````
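For readers skimming the diff: the new `__divergent` check in gpuintrin.h tests whether the active lanes in `__lane_mask` form one contiguous run of bits, and only falls back to a serialized scan over the set bits of the mask when they do not. Below is a minimal host-side sketch of those two bit tricks, with a plain array standing in for the shuffle; the names `is_divergent` and `serial_scan` are illustrative only and are not part of the header.

```c
#include <stdint.h>
#include <stdio.h>

/* Sketch only: models the patch's convergence test and its serialized
 * fallback scan on the host, with an array standing in for the shuffle. */

/* Active lanes are convergent when they form one contiguous run of bits.
 * After shifting out the trailing zeros, `first & (first + 1)` is non-zero
 * exactly when there is a hole somewhere in that run. */
static int is_divergent(uint64_t lane_mask) {
  uint64_t first = lane_mask >> __builtin_ctzll(lane_mask);
  return (first & (first + 1)) != 0;
}

/* Inclusive prefix sum over only the active lanes, visiting the set bits of
 * the mask from lowest to highest, as the divergent branch of
 * __DO_LANE_SCAN does with shuffles. */
static void serial_scan(uint64_t lane_mask, uint32_t lanes[64]) {
  uint32_t accum = 0;
  for (uint64_t mask = lane_mask; mask; mask &= mask - 1) {
    uint32_t index = __builtin_ctzll(mask); /* next active lane   */
    uint32_t tmp = lanes[index];            /* value read from it */
    lanes[index] = accum + tmp;             /* its inclusive sum  */
    accum += tmp;
  }
}

int main(void) {
  uint64_t mask = 0x32; /* lanes 1, 4, 5 active: a hole at lanes 2-3 */
  uint32_t lanes[64] = {0};
  lanes[1] = 10, lanes[4] = 20, lanes[5] = 30;
  printf("divergent: %d\n", is_divergent(mask));      /* prints 1 */
  serial_scan(mask, lanes);
  printf("%u %u %u\n", lanes[1], lanes[4], lanes[5]); /* prints 10 30 60 */
  return 0;
}
```

With that scan in hand, the divergent-lane sum in `__DO_LANE_SUM` is simply the scan result broadcast from the highest active lane, which is why it shuffles from index `63 - __builtin_clzll(__lane_mask)`.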
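The nvptxintrin.h hunk relies on the usual `-bool & value` trick: negating a `bool` yields either all-one bits or zero, so the shuffled value is forced to zero whenever the source lane is not set in `__lane_mask`, matching what AMDGPU returns for inactive lanes. A small stand-alone illustration of the idiom (the mask and value below are made up):

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  uint64_t lane_mask = 0x32;      /* made-up mask: lanes 1, 4, 5 active */
  uint32_t shuffled = 0xDEADBEEF; /* stand-in for the shuffle's result  */
  for (uint32_t idx = 0; idx < 6; ++idx) {
    bool bitmask = (1ull << idx) & lane_mask;
    /* -true is all ones, -false is zero: inactive source lanes read 0. */
    uint32_t result = -bitmask & shuffled;
    printf("lane %u -> %#x\n", idx, result);
  }
  return 0;
}
```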
https://github.com/llvm/llvm-project/pull/128085