[libc-commits] [libc] [libc][gpu] Disable loop unrolling in the throughput benchmark loop by default (PR #153971)
via libc-commits
libc-commits at lists.llvm.org
Sat Aug 16 11:46:30 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
@llvm/pr-subscribers-libc
Author: Leandro Lacerda (leandrolcampos)
<details>
<summary>Changes</summary>
This patch makes GPU throughput benchmark results more comparable across targets by disabling loop unrolling in the benchmark loop by default. It also adds an opt-in switch for users who want to study instruction-level parallelism (ILP) effects.
Motivation:
* PTX (post-LTO) evidence on NVPTX: for libc `sin`, the generated PTX shows the `throughput` loop unrolled 8x at `N=128` (one iteration advances the input pointer by 64 bytes = 8 doubles), interleaving eight independent chains before the back-edge. This hides latency and significantly reduces cycles/call as the batch size `N` grows.
* Observed scaling (NVPTX measurements): with unrolling enabled, `sin` dropped from ~3,100 cycles/call at `N=1` to ~360 cycles/call at `N=128`. After enforcing `#pragma clang loop unroll(disable)`, results stabilized (e.g., ~3,100 cycles/call at `N=1` vs. ~2,700 cycles/call at `N=128`).
* libdevice contrast: the libdevice `sin` path did not exhibit a similar drop in our measurements, and the PTX appears as compact internal calls rather than a long FMA chain, leaving less ILP for the outer loop to extract.
What this change does:
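* Applies `#pragma clang loop unroll(disable)` to the measurement loops of the GPU `throughput()` and `throughput_baseline()` functions (both the single-input and two-input overloads) in the NVPTX and AMDGPU timing headers.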
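* Adds a build switch to re-enable unrolling for ILP studies: `LIBC_GPU_BENCHMARKS_ALLOW_UNROLL` (default: `OFF`). When the option is `OFF`, benchmarks are compiled with `-DLIBC_GPU_BENCHMARKS_DISABLE_UNROLL=1`, which is the macro that guards the pragma in the timing headers.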
Leaving unrolling entirely to the optimizer undermines apples-to-apples comparisons (e.g., libc vs. vendor implementations), because the amount of unrolling applied can differ from one implementation to another. Disabling unrolling by default yields fairer, more consistent numbers; users can still opt in to unrolling to probe peak ILP.
---
Full diff: https://github.com/llvm/llvm-project/pull/153971.diff
3 Files Affected:
- (modified) libc/benchmarks/gpu/CMakeLists.txt (+9)
- (modified) libc/benchmarks/gpu/timing/amdgpu/timing.h (+16)
- (modified) libc/benchmarks/gpu/timing/nvptx/timing.h (+16)
``````````diff
diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
index 6ca134b12a479..9e57d8e4590d6 100644
--- a/libc/benchmarks/gpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/CMakeLists.txt
@@ -2,6 +2,8 @@ add_subdirectory(timing)
add_custom_target(gpu-benchmark)
+option(LIBC_GPU_BENCHMARKS_ALLOW_UNROLL "Allow compiler loop unrolling in throughput loops" OFF)
+
function(add_benchmark benchmark_name)
cmake_parse_arguments(
"BENCHMARK"
@@ -14,6 +16,12 @@ function(add_benchmark benchmark_name)
if(NOT libc.src.time.clock IN_LIST TARGET_LLVMLIBC_ENTRYPOINTS)
message(FATAL_ERROR "target does not support clock")
endif()
+
+ set(benchmark_extra_flags "")
+ if(NOT LIBC_GPU_BENCHMARKS_ALLOW_UNROLL)
+ list(APPEND benchmark_extra_flags "-DLIBC_GPU_BENCHMARKS_DISABLE_UNROLL=1")
+ endif()
+
add_libc_hermetic(
${benchmark_name}
IS_GPU_BENCHMARK
@@ -26,6 +34,7 @@ function(add_benchmark benchmark_name)
${BENCHMARK_UNPARSED_ARGUMENTS}
COMPILE_OPTIONS
-flto
+ ${benchmark_extra_flags}
)
get_fq_target_name(${benchmark_name} fq_target_name)
set(fq_build_target_name ${fq_target_name}.__build__)
diff --git a/libc/benchmarks/gpu/timing/amdgpu/timing.h b/libc/benchmarks/gpu/timing/amdgpu/timing.h
index b4a174f729817..5c1d3a0582d45 100644
--- a/libc/benchmarks/gpu/timing/amdgpu/timing.h
+++ b/libc/benchmarks/gpu/timing/amdgpu/timing.h
@@ -117,6 +117,10 @@ throughput_baseline(const cpp::array<T, N> &inputs) {
asm("" ::"s"(start));
T result{};
+
+ #if defined(LIBC_GPU_BENCHMARKS_DISABLE_UNROLL)
+ #pragma clang loop unroll(disable)
+ #endif
for (auto input : inputs) {
asm("" ::"v"(input));
result = input;
@@ -146,6 +150,10 @@ static LIBC_INLINE uint64_t throughput(F f, const cpp::array<T, N> &inputs) {
asm("" ::"s"(start));
T result{};
+
+ #if defined(LIBC_GPU_BENCHMARKS_DISABLE_UNROLL)
+ #pragma clang loop unroll(disable)
+ #endif
for (auto input : inputs) {
asm("" ::"v"(input));
result = f(input);
@@ -174,6 +182,10 @@ static LIBC_INLINE uint64_t throughput_baseline(
asm("" ::"s"(start));
T result{};
+
+ #if defined(LIBC_GPU_BENCHMARKS_DISABLE_UNROLL)
+ #pragma clang loop unroll(disable)
+ #endif
for (size_t i = 0; i < N; i++) {
T x = inputs1[i];
T y = inputs2[i];
@@ -206,6 +218,10 @@ static LIBC_INLINE uint64_t throughput(F f, const cpp::array<T, N> &inputs1,
asm("" ::"s"(start));
T result{};
+
+ #if defined(LIBC_GPU_BENCHMARKS_DISABLE_UNROLL)
+ #pragma clang loop unroll(disable)
+ #endif
for (size_t i = 0; i < N; i++) {
T x = inputs1[i];
T y = inputs2[i];
diff --git a/libc/benchmarks/gpu/timing/nvptx/timing.h b/libc/benchmarks/gpu/timing/nvptx/timing.h
index 0c93a67129b8d..e671e378c9e2e 100644
--- a/libc/benchmarks/gpu/timing/nvptx/timing.h
+++ b/libc/benchmarks/gpu/timing/nvptx/timing.h
@@ -106,6 +106,10 @@ throughput_baseline(const cpp::array<T, N> &inputs) {
asm("" ::"llr"(start));
T result{};
+
+ #if defined(LIBC_GPU_BENCHMARKS_DISABLE_UNROLL)
+ #pragma clang loop unroll(disable)
+ #endif
for (auto input : inputs) {
asm("" ::"r"(input));
result = input;
@@ -135,6 +139,10 @@ static LIBC_INLINE uint64_t throughput(F f, const cpp::array<T, N> &inputs) {
asm("" ::"llr"(start));
T result{};
+
+ #if defined(LIBC_GPU_BENCHMARKS_DISABLE_UNROLL)
+ #pragma clang loop unroll(disable)
+ #endif
for (auto input : inputs) {
asm("" ::"r"(input));
result = f(input);
@@ -163,6 +171,10 @@ static LIBC_INLINE uint64_t throughput_baseline(
asm("" ::"llr"(start));
T result{};
+
+ #if defined(LIBC_GPU_BENCHMARKS_DISABLE_UNROLL)
+ #pragma clang loop unroll(disable)
+ #endif
for (size_t i = 0; i < N; i++) {
T x = inputs1[i];
T y = inputs2[i];
@@ -195,6 +207,10 @@ static LIBC_INLINE uint64_t throughput(F f, const cpp::array<T, N> &inputs1,
asm("" ::"llr"(start));
T result{};
+
+ #if defined(LIBC_GPU_BENCHMARKS_DISABLE_UNROLL)
+ #pragma clang loop unroll(disable)
+ #endif
for (size_t i = 0; i < N; i++) {
T x = inputs1[i];
T y = inputs2[i];
``````````
</details>
https://github.com/llvm/llvm-project/pull/153971
More information about the libc-commits
mailing list