[libc-commits] [libc] [libc] Polish GPU benchmarking (PR #153900)

Leandro Lacerda via libc-commits libc-commits at lists.llvm.org
Fri Aug 15 16:17:47 PDT 2025


https://github.com/leandrolcampos created https://github.com/llvm/llvm-project/pull/153900

This patch provides cleanups and improvements for the GPU benchmarking infrastructure. The key changes are:

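- Fix benchmark convergence bug: Round up the scaled iteration count (using `ceil`) to ensure it always grows. The previous logic truncated `iterations * scaling_factor` toward zero, which for small iteration counts can yield the same value again and leave the iteration count stuck.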
- Resolve the remaining compiler warning: a precision-loss warning in the NVPTX version of `latency()`.
- Remove unused `BenchmarkLogger` files: This is dead code that added maintenance and cognitive overhead without providing functionality.
- Improve build hygiene: Clean up headers and CMake dependencies to strictly follow the 'include what you use' (IWYU) principle.

>From 45e974be1a4049eb84dfbd7a9f426293e46149e6 Mon Sep 17 00:00:00 2001
From: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
Date: Fri, 15 Aug 2025 09:53:12 -0300
Subject: [PATCH 1/4] Clean up headers and CMake deps

---
 libc/benchmarks/gpu/CMakeLists.txt               | 15 ++++-----------
 libc/benchmarks/gpu/LibcGpuBenchmark.cpp         | 10 ++++++----
 libc/benchmarks/gpu/LibcGpuBenchmark.h           |  4 +---
 libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt |  3 ++-
 libc/benchmarks/gpu/timing/amdgpu/timing.h       |  1 -
 libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt  |  3 ++-
 libc/benchmarks/gpu/timing/nvptx/timing.h        |  2 --
 7 files changed, 15 insertions(+), 23 deletions(-)

diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
index beedac78d4826..cfd458552e96c 100644
--- a/libc/benchmarks/gpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/CMakeLists.txt
@@ -38,31 +38,24 @@ add_unittest_framework_library(
   SRCS
     LibcGpuBenchmark.cpp
     LibcGpuBenchmarkMain.cpp
-    BenchmarkLogger.cpp
   HDRS
     LibcGpuBenchmark.h
-    BenchmarkLogger.h
   DEPENDS
+    libc.benchmarks.gpu.timing.timing
     libc.hdr.stdint_proxy
-    libc.src.__support.big_int
-    libc.src.__support.c_string
     libc.src.__support.CPP.string
     libc.src.__support.CPP.string_view
     libc.src.__support.CPP.type_traits
-    libc.src.__support.CPP.limits
     libc.src.__support.CPP.algorithm
     libc.src.__support.CPP.atomic
     libc.src.__support.CPP.array
-    libc.src.__support.fixed_point.fx_rep
-    libc.src.__support.macros.properties.types
-    libc.src.__support.OSUtil.osutil
-    libc.src.__support.uint128
     libc.src.__support.FPUtil.fp_bits
     libc.src.__support.FPUtil.sqrt
     libc.src.__support.fixedvector
-    libc.src.time.clock
-    libc.benchmarks.gpu.timing.timing
+    libc.src.__support.GPU.utils
+    libc.src.__support.time.gpu.time_utils
     libc.src.stdio.printf
+    libc.src.time.clock
 )
 
 add_subdirectory(src)
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index ef816c51a87d7..50612e8d571a4 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -2,7 +2,6 @@
 
 #include "hdr/stdint_proxy.h"
 #include "src/__support/CPP/algorithm.h"
-#include "src/__support/CPP/array.h"
 #include "src/__support/CPP/atomic.h"
 #include "src/__support/CPP/string.h"
 #include "src/__support/FPUtil/FPBits.h"
@@ -12,6 +11,7 @@
 #include "src/__support/macros/config.h"
 #include "src/__support/time/gpu/time_utils.h"
 #include "src/stdio/printf.h"
+#include "src/time/clock.h"
 
 namespace LIBC_NAMESPACE_DECL {
 namespace benchmarks {
@@ -136,9 +136,11 @@ void print_results(Benchmark *b) {
   LIBC_NAMESPACE::printf(
       "%-24s |%15.0f |%9.0f |%8llu |%8llu |%11llu |%9u |\n",
       b->get_test_name().data(), final_result.cycles,
-      final_result.standard_deviation, (unsigned long long)final_result.min,
-      (unsigned long long)final_result.max,
-      (unsigned long long)final_result.total_iterations, (unsigned)num_threads);
+      final_result.standard_deviation,
+      static_cast<unsigned long long>(final_result.min),
+      static_cast<unsigned long long>(final_result.max),
+      static_cast<unsigned long long>(final_result.total_iterations),
+      static_cast<unsigned>(num_threads));
 }
 
 void print_header() {
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index 60f69edf86556..e36e93c7efc18 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -1,18 +1,16 @@
 #ifndef LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H
 #define LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H
 
-#include "benchmarks/gpu/BenchmarkLogger.h"
 #include "benchmarks/gpu/timing/timing.h"
+
 #include "hdr/stdint_proxy.h"
 #include "src/__support/CPP/algorithm.h"
 #include "src/__support/CPP/array.h"
-#include "src/__support/CPP/limits.h"
 #include "src/__support/CPP/string_view.h"
 #include "src/__support/CPP/type_traits.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/__support/FPUtil/sqrt.h"
 #include "src/__support/macros/config.h"
-#include "src/time/clock.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt b/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt
index d6a89d04dab97..f85152e69c346 100644
--- a/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt
@@ -4,10 +4,11 @@ add_header_library(
     timing.h
   DEPENDS
     libc.hdr.stdint_proxy
-    libc.src.__support.common
     libc.src.__support.macros.config
     libc.src.__support.macros.attributes
     libc.src.__support.CPP.algorithm
     libc.src.__support.CPP.array
+    libc.src.__support.CPP.atomic
     libc.src.__support.CPP.type_traits
+    libc.src.__support.GPU.utils
 )
diff --git a/libc/benchmarks/gpu/timing/amdgpu/timing.h b/libc/benchmarks/gpu/timing/amdgpu/timing.h
index de721a2d6ce6b..b4a174f729817 100644
--- a/libc/benchmarks/gpu/timing/amdgpu/timing.h
+++ b/libc/benchmarks/gpu/timing/amdgpu/timing.h
@@ -15,7 +15,6 @@
 #include "src/__support/CPP/atomic.h"
 #include "src/__support/CPP/type_traits.h"
 #include "src/__support/GPU/utils.h"
-#include "src/__support/common.h"
 #include "src/__support/macros/attributes.h"
 #include "src/__support/macros/config.h"
 
diff --git a/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt b/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt
index 801080e7a6e98..4615f53e3d247 100644
--- a/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt
+++ b/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt
@@ -4,10 +4,11 @@ add_header_library(
     timing.h
   DEPENDS
     libc.hdr.stdint_proxy
-    libc.src.__support.common
     libc.src.__support.macros.config
     libc.src.__support.macros.attributes
     libc.src.__support.CPP.algorithm
     libc.src.__support.CPP.array
+    libc.src.__support.CPP.atomic
     libc.src.__support.CPP.type_traits
+    libc.src.__support.GPU.utils
 )
diff --git a/libc/benchmarks/gpu/timing/nvptx/timing.h b/libc/benchmarks/gpu/timing/nvptx/timing.h
index 133032ca08423..d0c8bd1f9ee01 100644
--- a/libc/benchmarks/gpu/timing/nvptx/timing.h
+++ b/libc/benchmarks/gpu/timing/nvptx/timing.h
@@ -13,9 +13,7 @@
 #include "src/__support/CPP/algorithm.h"
 #include "src/__support/CPP/array.h"
 #include "src/__support/CPP/atomic.h"
-#include "src/__support/CPP/type_traits.h"
 #include "src/__support/GPU/utils.h"
-#include "src/__support/common.h"
 #include "src/__support/macros/attributes.h"
 #include "src/__support/macros/config.h"
 

>From 607f4e7677d64c06c9c87514e0adaad5fcfc1c2d Mon Sep 17 00:00:00 2001
From: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
Date: Fri, 15 Aug 2025 09:54:38 -0300
Subject: [PATCH 2/4] Remove dead `BenchmarkLogger`

---
 libc/benchmarks/gpu/BenchmarkLogger.cpp | 97 -------------------------
 libc/benchmarks/gpu/BenchmarkLogger.h   | 29 --------
 2 files changed, 126 deletions(-)
 delete mode 100644 libc/benchmarks/gpu/BenchmarkLogger.cpp
 delete mode 100644 libc/benchmarks/gpu/BenchmarkLogger.h

diff --git a/libc/benchmarks/gpu/BenchmarkLogger.cpp b/libc/benchmarks/gpu/BenchmarkLogger.cpp
deleted file mode 100644
index d5996a74f6dd7..0000000000000
--- a/libc/benchmarks/gpu/BenchmarkLogger.cpp
+++ /dev/null
@@ -1,97 +0,0 @@
-#include "benchmarks/gpu/BenchmarkLogger.h"
-#include "hdr/stdint_proxy.h"
-#include "src/__support/CPP/string.h"
-#include "src/__support/CPP/string_view.h"
-#include "src/__support/OSUtil/io.h"               // write_to_stderr
-#include "src/__support/big_int.h"                 // is_big_int
-#include "src/__support/macros/config.h"
-#include "src/__support/macros/properties/types.h" // LIBC_TYPES_HAS_INT128
-#include "src/__support/uint128.h"
-
-namespace LIBC_NAMESPACE_DECL {
-namespace benchmarks {
-
-// cpp::string_view specialization
-template <>
-BenchmarkLogger &
-    BenchmarkLogger::operator<< <cpp::string_view>(cpp::string_view str) {
-  LIBC_NAMESPACE::write_to_stderr(str);
-  return *this;
-}
-
-// cpp::string specialization
-template <>
-BenchmarkLogger &BenchmarkLogger::operator<< <cpp::string>(cpp::string str) {
-  return *this << static_cast<cpp::string_view>(str);
-}
-
-// const char* specialization
-template <>
-BenchmarkLogger &BenchmarkLogger::operator<< <const char *>(const char *str) {
-  return *this << cpp::string_view(str);
-}
-
-// char* specialization
-template <> BenchmarkLogger &BenchmarkLogger::operator<< <char *>(char *str) {
-  return *this << cpp::string_view(str);
-}
-
-// char specialization
-template <> BenchmarkLogger &BenchmarkLogger::operator<<(char ch) {
-  return *this << cpp::string_view(&ch, 1);
-}
-
-// bool specialization
-template <> BenchmarkLogger &BenchmarkLogger::operator<<(bool cond) {
-  return *this << (cond ? "true" : "false");
-}
-
-// void * specialization
-template <> BenchmarkLogger &BenchmarkLogger::operator<<(void *addr) {
-  return *this << "0x" << cpp::to_string(reinterpret_cast<uintptr_t>(addr));
-}
-
-template <typename T> BenchmarkLogger &BenchmarkLogger::operator<<(T t) {
-  if constexpr (is_big_int_v<T> ||
-                (cpp::is_integral_v<T> && cpp::is_unsigned_v<T> &&
-                 (sizeof(T) > sizeof(uint64_t)))) {
-    static_assert(sizeof(T) % 8 == 0, "Unsupported size of UInt");
-    const IntegerToString<T, radix::Hex::WithPrefix> buffer(t);
-    return *this << buffer.view();
-  } else {
-    return *this << cpp::to_string(t);
-  }
-}
-
-// is_integral specializations
-// char is already specialized to handle character
-template BenchmarkLogger &BenchmarkLogger::operator<< <short>(short);
-template BenchmarkLogger &BenchmarkLogger::operator<< <int>(int);
-template BenchmarkLogger &BenchmarkLogger::operator<< <long>(long);
-template BenchmarkLogger &BenchmarkLogger::operator<< <long long>(long long);
-template BenchmarkLogger &
-    BenchmarkLogger::operator<< <unsigned char>(unsigned char);
-template BenchmarkLogger &
-    BenchmarkLogger::operator<< <unsigned short>(unsigned short);
-template BenchmarkLogger &
-    BenchmarkLogger::operator<< <unsigned int>(unsigned int);
-template BenchmarkLogger &
-    BenchmarkLogger::operator<< <unsigned long>(unsigned long);
-template BenchmarkLogger &
-    BenchmarkLogger::operator<< <unsigned long long>(unsigned long long);
-
-#ifdef LIBC_TYPES_HAS_INT128
-template BenchmarkLogger &
-    BenchmarkLogger::operator<< <__uint128_t>(__uint128_t);
-#endif // LIBC_TYPES_HAS_INT128
-template BenchmarkLogger &BenchmarkLogger::operator<< <UInt<128>>(UInt<128>);
-template BenchmarkLogger &BenchmarkLogger::operator<< <UInt<192>>(UInt<192>);
-template BenchmarkLogger &BenchmarkLogger::operator<< <UInt<256>>(UInt<256>);
-template BenchmarkLogger &BenchmarkLogger::operator<< <UInt<320>>(UInt<320>);
-
-// TODO: Add floating point formatting once it's supported by StringStream.
-
-BenchmarkLogger log;
-
-} // namespace benchmarks
-} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/benchmarks/gpu/BenchmarkLogger.h b/libc/benchmarks/gpu/BenchmarkLogger.h
deleted file mode 100644
index 2b22aba085f86..0000000000000
--- a/libc/benchmarks/gpu/BenchmarkLogger.h
+++ /dev/null
@@ -1,29 +0,0 @@
-//===-- Utilities to log to standard output during tests --------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_BENCHMARKS_GPU_BENCHMARKLOGGER_H
-#define LLVM_LIBC_BENCHMARKS_GPU_BENCHMARKLOGGER_H
-
-#include "src/__support/macros/config.h"
-
-namespace LIBC_NAMESPACE_DECL {
-namespace benchmarks {
-
-// A class to log to standard output in the context of hermetic tests.
-struct BenchmarkLogger {
-  constexpr BenchmarkLogger() = default;
-  template <typename T> BenchmarkLogger &operator<<(T);
-};
-
-// A global TestLogger instance to be used in tests.
-extern BenchmarkLogger log;
-
-} // namespace benchmarks
-} // namespace LIBC_NAMESPACE_DECL
-
-#endif /* LLVM_LIBC_BENCHMARKS_GPU_BENCHMARKLOGGER_H */

>From 6642c9ab159d916da3d72521d74b080dfef9ba4c Mon Sep 17 00:00:00 2001
From: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
Date: Fri, 15 Aug 2025 10:04:58 -0300
Subject: [PATCH 3/4] Fix precision-loss warning in NVPTX version of
 `latency()`

---
 libc/benchmarks/gpu/timing/nvptx/timing.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libc/benchmarks/gpu/timing/nvptx/timing.h b/libc/benchmarks/gpu/timing/nvptx/timing.h
index d0c8bd1f9ee01..0c93a67129b8d 100644
--- a/libc/benchmarks/gpu/timing/nvptx/timing.h
+++ b/libc/benchmarks/gpu/timing/nvptx/timing.h
@@ -64,7 +64,7 @@ template <typename F, typename T>
   uint64_t stop = gpu::processor_clock();
   cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
   asm("" ::"r"(stop));
-  volatile T output = result;
+  volatile auto output = result;
 
   // Return the time elapsed.
   return stop - start;

>From 71dd5685e22577e781b2b56dd1e832646149f687 Mon Sep 17 00:00:00 2001
From: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
Date: Fri, 15 Aug 2025 19:31:29 -0300
Subject: [PATCH 4/4] Round up scaled iteration count to ensure growth

---
 libc/benchmarks/gpu/CMakeLists.txt       | 1 +
 libc/benchmarks/gpu/LibcGpuBenchmark.cpp | 8 +++++---
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
index cfd458552e96c..6ca134b12a479 100644
--- a/libc/benchmarks/gpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/CMakeLists.txt
@@ -50,6 +50,7 @@ add_unittest_framework_library(
     libc.src.__support.CPP.atomic
     libc.src.__support.CPP.array
     libc.src.__support.FPUtil.fp_bits
+    libc.src.__support.FPUtil.nearest_integer_operations
     libc.src.__support.FPUtil.sqrt
     libc.src.__support.fixedvector
     libc.src.__support.GPU.utils
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index 50612e8d571a4..a4a0ff4ec46e5 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -5,6 +5,7 @@
 #include "src/__support/CPP/atomic.h"
 #include "src/__support/CPP/string.h"
 #include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/NearestIntegerOperations.h"
 #include "src/__support/FPUtil/sqrt.h"
 #include "src/__support/GPU/utils.h"
 #include "src/__support/fixedvector.h"
@@ -134,7 +135,7 @@ void print_results(Benchmark *b) {
   cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
 
   LIBC_NAMESPACE::printf(
-      "%-24s |%15.0f |%9.0f |%8llu |%8llu |%11llu |%9u |\n",
+      "%-24s |%15.0f |%9.0f |%8llu |%8llu |%15llu |%9u |\n",
       b->get_test_name().data(), final_result.cycles,
       final_result.standard_deviation,
       static_cast<unsigned long long>(final_result.min),
@@ -149,7 +150,7 @@ void print_header() {
                          benchmarks[0]->get_suite_name().data());
   LIBC_NAMESPACE::printf("%s", RESET);
   cpp::string titles = "Benchmark                |  Cycles (Mean) |   Stddev | "
-                       "    Min |     Max | Iterations |  Threads |\n";
+                       "    Min |     Max |     Iterations |  Threads |\n";
   LIBC_NAMESPACE::printf(titles.data());
 
   cpp::string separator(titles.size(), '-');
@@ -228,7 +229,8 @@ BenchmarkResult benchmark(const BenchmarkOptions &options,
         change_ratio < options.epsilon)
       break;
 
-    iterations = static_cast<uint32_t>(iterations * options.scaling_factor);
+    iterations = static_cast<uint32_t>(
+        fputil::ceil(iterations * options.scaling_factor));
   }
 
   const auto &estimator = rep.get_estimator();



More information about the libc-commits mailing list