[libc-commits] [libc] [libc][gpu] Disable loop unrolling in the throughput benchmark loop (PR #153971)

Leandro Lacerda via libc-commits libc-commits at lists.llvm.org
Sat Aug 16 13:06:36 PDT 2025


leandrolcampos (https://github.com/leandrolcampos) updated pull request https://github.com/llvm/llvm-project/pull/153971

>From c227a4b5a7d30a6e11ba5ac4ad50229e1f3e8149 Mon Sep 17 00:00:00 2001
From: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
Date: Sat, 16 Aug 2025 15:14:08 -0300
Subject: [PATCH 1/2] Disable loop unrolling in the throughput benchmark loop
 by default

---
 libc/benchmarks/gpu/CMakeLists.txt         |  9 +++++++++
 libc/benchmarks/gpu/timing/amdgpu/timing.h | 16 ++++++++++++++++
 libc/benchmarks/gpu/timing/nvptx/timing.h  | 16 ++++++++++++++++
 3 files changed, 41 insertions(+)

diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
index 6ca134b12a479..9e57d8e4590d6 100644
--- a/libc/benchmarks/gpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/CMakeLists.txt
@@ -2,6 +2,8 @@ add_subdirectory(timing)
 
 add_custom_target(gpu-benchmark)
 
+option(LIBC_GPU_BENCHMARKS_ALLOW_UNROLL "Allow compiler loop unrolling in throughput loops" OFF)
+
 function(add_benchmark benchmark_name)
   cmake_parse_arguments(
     "BENCHMARK"
@@ -14,6 +16,12 @@ function(add_benchmark benchmark_name)
   if(NOT libc.src.time.clock IN_LIST TARGET_LLVMLIBC_ENTRYPOINTS)
     message(FATAL_ERROR "target does not support clock")
   endif()
+
+  set(benchmark_extra_flags "")
+  if(NOT LIBC_GPU_BENCHMARKS_ALLOW_UNROLL)
+    list(APPEND benchmark_extra_flags "-DLIBC_GPU_BENCHMARKS_DISABLE_UNROLL=1")
+  endif()
+
   add_libc_hermetic(
     ${benchmark_name}
     IS_GPU_BENCHMARK
@@ -26,6 +34,7 @@ function(add_benchmark benchmark_name)
     ${BENCHMARK_UNPARSED_ARGUMENTS}
     COMPILE_OPTIONS
       -flto
+      ${benchmark_extra_flags}
   )
   get_fq_target_name(${benchmark_name} fq_target_name)
   set(fq_build_target_name ${fq_target_name}.__build__)
diff --git a/libc/benchmarks/gpu/timing/amdgpu/timing.h b/libc/benchmarks/gpu/timing/amdgpu/timing.h
index b4a174f729817..5c1d3a0582d45 100644
--- a/libc/benchmarks/gpu/timing/amdgpu/timing.h
+++ b/libc/benchmarks/gpu/timing/amdgpu/timing.h
@@ -117,6 +117,10 @@ throughput_baseline(const cpp::array<T, N> &inputs) {
   asm("" ::"s"(start));
 
   T result{};
+
+  #if defined(LIBC_GPU_BENCHMARKS_DISABLE_UNROLL)
+  #pragma clang loop unroll(disable)
+  #endif
   for (auto input : inputs) {
     asm("" ::"v"(input));
     result = input;
@@ -146,6 +150,10 @@ static LIBC_INLINE uint64_t throughput(F f, const cpp::array<T, N> &inputs) {
   asm("" ::"s"(start));
 
   T result{};
+
+  #if defined(LIBC_GPU_BENCHMARKS_DISABLE_UNROLL)
+  #pragma clang loop unroll(disable)
+  #endif
   for (auto input : inputs) {
     asm("" ::"v"(input));
     result = f(input);
@@ -174,6 +182,10 @@ static LIBC_INLINE uint64_t throughput_baseline(
   asm("" ::"s"(start));
 
   T result{};
+
+  #if defined(LIBC_GPU_BENCHMARKS_DISABLE_UNROLL)
+  #pragma clang loop unroll(disable)
+  #endif
   for (size_t i = 0; i < N; i++) {
     T x = inputs1[i];
     T y = inputs2[i];
@@ -206,6 +218,10 @@ static LIBC_INLINE uint64_t throughput(F f, const cpp::array<T, N> &inputs1,
   asm("" ::"s"(start));
 
   T result{};
+
+  #if defined(LIBC_GPU_BENCHMARKS_DISABLE_UNROLL)
+  #pragma clang loop unroll(disable)
+  #endif
   for (size_t i = 0; i < N; i++) {
     T x = inputs1[i];
     T y = inputs2[i];
diff --git a/libc/benchmarks/gpu/timing/nvptx/timing.h b/libc/benchmarks/gpu/timing/nvptx/timing.h
index 0c93a67129b8d..e671e378c9e2e 100644
--- a/libc/benchmarks/gpu/timing/nvptx/timing.h
+++ b/libc/benchmarks/gpu/timing/nvptx/timing.h
@@ -106,6 +106,10 @@ throughput_baseline(const cpp::array<T, N> &inputs) {
   asm("" ::"llr"(start));
 
   T result{};
+
+  #if defined(LIBC_GPU_BENCHMARKS_DISABLE_UNROLL)
+  #pragma clang loop unroll(disable)
+  #endif
   for (auto input : inputs) {
     asm("" ::"r"(input));
     result = input;
@@ -135,6 +139,10 @@ static LIBC_INLINE uint64_t throughput(F f, const cpp::array<T, N> &inputs) {
   asm("" ::"llr"(start));
 
   T result{};
+
+  #if defined(LIBC_GPU_BENCHMARKS_DISABLE_UNROLL)
+  #pragma clang loop unroll(disable)
+  #endif
   for (auto input : inputs) {
     asm("" ::"r"(input));
     result = f(input);
@@ -163,6 +171,10 @@ static LIBC_INLINE uint64_t throughput_baseline(
   asm("" ::"llr"(start));
 
   T result{};
+
+  #if defined(LIBC_GPU_BENCHMARKS_DISABLE_UNROLL)
+  #pragma clang loop unroll(disable)
+  #endif
   for (size_t i = 0; i < N; i++) {
     T x = inputs1[i];
     T y = inputs2[i];
@@ -195,6 +207,10 @@ static LIBC_INLINE uint64_t throughput(F f, const cpp::array<T, N> &inputs1,
   asm("" ::"llr"(start));
 
   T result{};
+
+  #if defined(LIBC_GPU_BENCHMARKS_DISABLE_UNROLL)
+  #pragma clang loop unroll(disable)
+  #endif
   for (size_t i = 0; i < N; i++) {
     T x = inputs1[i];
     T y = inputs2[i];

>From 4da8053914c8136d9120c57b8300338545639157 Mon Sep 17 00:00:00 2001
From: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
Date: Sat, 16 Aug 2025 17:04:08 -0300
Subject: [PATCH 2/2] Remove unroll toggle; make `throughput` loop non-unrolled
 unconditionally

---
 libc/benchmarks/gpu/CMakeLists.txt         |  9 ---------
 libc/benchmarks/gpu/timing/amdgpu/timing.h | 16 ++++------------
 libc/benchmarks/gpu/timing/nvptx/timing.h  | 16 ++++------------
 3 files changed, 8 insertions(+), 33 deletions(-)

diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
index 9e57d8e4590d6..6ca134b12a479 100644
--- a/libc/benchmarks/gpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/CMakeLists.txt
@@ -2,8 +2,6 @@ add_subdirectory(timing)
 
 add_custom_target(gpu-benchmark)
 
-option(LIBC_GPU_BENCHMARKS_ALLOW_UNROLL "Allow compiler loop unrolling in throughput loops" OFF)
-
 function(add_benchmark benchmark_name)
   cmake_parse_arguments(
     "BENCHMARK"
@@ -16,12 +14,6 @@ function(add_benchmark benchmark_name)
   if(NOT libc.src.time.clock IN_LIST TARGET_LLVMLIBC_ENTRYPOINTS)
     message(FATAL_ERROR "target does not support clock")
   endif()
-
-  set(benchmark_extra_flags "")
-  if(NOT LIBC_GPU_BENCHMARKS_ALLOW_UNROLL)
-    list(APPEND benchmark_extra_flags "-DLIBC_GPU_BENCHMARKS_DISABLE_UNROLL=1")
-  endif()
-
   add_libc_hermetic(
     ${benchmark_name}
     IS_GPU_BENCHMARK
@@ -34,7 +26,6 @@ function(add_benchmark benchmark_name)
     ${BENCHMARK_UNPARSED_ARGUMENTS}
     COMPILE_OPTIONS
       -flto
-      ${benchmark_extra_flags}
   )
   get_fq_target_name(${benchmark_name} fq_target_name)
   set(fq_build_target_name ${fq_target_name}.__build__)
diff --git a/libc/benchmarks/gpu/timing/amdgpu/timing.h b/libc/benchmarks/gpu/timing/amdgpu/timing.h
index 5c1d3a0582d45..8b92584b39230 100644
--- a/libc/benchmarks/gpu/timing/amdgpu/timing.h
+++ b/libc/benchmarks/gpu/timing/amdgpu/timing.h
@@ -118,9 +118,7 @@ throughput_baseline(const cpp::array<T, N> &inputs) {
 
   T result{};
 
-  #if defined(LIBC_GPU_BENCHMARKS_DISABLE_UNROLL)
-  #pragma clang loop unroll(disable)
-  #endif
+#pragma clang loop unroll(disable)
   for (auto input : inputs) {
     asm("" ::"v"(input));
     result = input;
@@ -151,9 +149,7 @@ static LIBC_INLINE uint64_t throughput(F f, const cpp::array<T, N> &inputs) {
 
   T result{};
 
-  #if defined(LIBC_GPU_BENCHMARKS_DISABLE_UNROLL)
-  #pragma clang loop unroll(disable)
-  #endif
+#pragma clang loop unroll(disable)
   for (auto input : inputs) {
     asm("" ::"v"(input));
     result = f(input);
@@ -183,9 +179,7 @@ static LIBC_INLINE uint64_t throughput_baseline(
 
   T result{};
 
-  #if defined(LIBC_GPU_BENCHMARKS_DISABLE_UNROLL)
-  #pragma clang loop unroll(disable)
-  #endif
+#pragma clang loop unroll(disable)
   for (size_t i = 0; i < N; i++) {
     T x = inputs1[i];
     T y = inputs2[i];
@@ -219,9 +213,7 @@ static LIBC_INLINE uint64_t throughput(F f, const cpp::array<T, N> &inputs1,
 
   T result{};
 
-  #if defined(LIBC_GPU_BENCHMARKS_DISABLE_UNROLL)
-  #pragma clang loop unroll(disable)
-  #endif
+#pragma clang loop unroll(disable)
   for (size_t i = 0; i < N; i++) {
     T x = inputs1[i];
     T y = inputs2[i];
diff --git a/libc/benchmarks/gpu/timing/nvptx/timing.h b/libc/benchmarks/gpu/timing/nvptx/timing.h
index e671e378c9e2e..944d3732eae65 100644
--- a/libc/benchmarks/gpu/timing/nvptx/timing.h
+++ b/libc/benchmarks/gpu/timing/nvptx/timing.h
@@ -107,9 +107,7 @@ throughput_baseline(const cpp::array<T, N> &inputs) {
 
   T result{};
 
-  #if defined(LIBC_GPU_BENCHMARKS_DISABLE_UNROLL)
-  #pragma clang loop unroll(disable)
-  #endif
+#pragma clang loop unroll(disable)
   for (auto input : inputs) {
     asm("" ::"r"(input));
     result = input;
@@ -140,9 +138,7 @@ static LIBC_INLINE uint64_t throughput(F f, const cpp::array<T, N> &inputs) {
 
   T result{};
 
-  #if defined(LIBC_GPU_BENCHMARKS_DISABLE_UNROLL)
-  #pragma clang loop unroll(disable)
-  #endif
+#pragma clang loop unroll(disable)
   for (auto input : inputs) {
     asm("" ::"r"(input));
     result = f(input);
@@ -172,9 +168,7 @@ static LIBC_INLINE uint64_t throughput_baseline(
 
   T result{};
 
-  #if defined(LIBC_GPU_BENCHMARKS_DISABLE_UNROLL)
-  #pragma clang loop unroll(disable)
-  #endif
+#pragma clang loop unroll(disable)
   for (size_t i = 0; i < N; i++) {
     T x = inputs1[i];
     T y = inputs2[i];
@@ -208,9 +202,7 @@ static LIBC_INLINE uint64_t throughput(F f, const cpp::array<T, N> &inputs1,
 
   T result{};
 
-  #if defined(LIBC_GPU_BENCHMARKS_DISABLE_UNROLL)
-  #pragma clang loop unroll(disable)
-  #endif
+#pragma clang loop unroll(disable)
   for (size_t i = 0; i < N; i++) {
     T x = inputs1[i];
     T y = inputs2[i];



More information about the libc-commits mailing list