[libc-commits] [libc] [libc] Polish GPU benchmarking (PR #153900)
Leandro Lacerda via libc-commits
libc-commits at lists.llvm.org
Fri Aug 15 16:17:47 PDT 2025
https://github.com/leandrolcampos created https://github.com/llvm/llvm-project/pull/153900
This patch provides cleanups and improvements for the GPU benchmarking infrastructure. The key changes are:
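- Fix benchmark convergence bug: Round up the scaled iteration count (ceil) to ensure it grows properly. The previous truncation logic could cause the iteration count to get stuck: whenever `iterations * scaling_factor` truncated back to the same integer, `iterations` never increased.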
- Resolve the remaining compiler warning: a precision-loss warning in the NVPTX version of `latency()`.
- Remove unused `BenchmarkLogger` files: This is dead code that added maintenance and cognitive overhead without providing functionality.
- Improve build hygiene: Clean up headers and CMake dependencies to strictly follow the 'include what you use' (IWYU) principle.
>From 45e974be1a4049eb84dfbd7a9f426293e46149e6 Mon Sep 17 00:00:00 2001
From: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
Date: Fri, 15 Aug 2025 09:53:12 -0300
Subject: [PATCH 1/4] Clean up headers and CMake deps
---
libc/benchmarks/gpu/CMakeLists.txt | 15 ++++-----------
libc/benchmarks/gpu/LibcGpuBenchmark.cpp | 10 ++++++----
libc/benchmarks/gpu/LibcGpuBenchmark.h | 4 +---
libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt | 3 ++-
libc/benchmarks/gpu/timing/amdgpu/timing.h | 1 -
libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt | 3 ++-
libc/benchmarks/gpu/timing/nvptx/timing.h | 2 --
7 files changed, 15 insertions(+), 23 deletions(-)
diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
index beedac78d4826..cfd458552e96c 100644
--- a/libc/benchmarks/gpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/CMakeLists.txt
@@ -38,31 +38,24 @@ add_unittest_framework_library(
SRCS
LibcGpuBenchmark.cpp
LibcGpuBenchmarkMain.cpp
- BenchmarkLogger.cpp
HDRS
LibcGpuBenchmark.h
- BenchmarkLogger.h
DEPENDS
+ libc.benchmarks.gpu.timing.timing
libc.hdr.stdint_proxy
- libc.src.__support.big_int
- libc.src.__support.c_string
libc.src.__support.CPP.string
libc.src.__support.CPP.string_view
libc.src.__support.CPP.type_traits
- libc.src.__support.CPP.limits
libc.src.__support.CPP.algorithm
libc.src.__support.CPP.atomic
libc.src.__support.CPP.array
- libc.src.__support.fixed_point.fx_rep
- libc.src.__support.macros.properties.types
- libc.src.__support.OSUtil.osutil
- libc.src.__support.uint128
libc.src.__support.FPUtil.fp_bits
libc.src.__support.FPUtil.sqrt
libc.src.__support.fixedvector
- libc.src.time.clock
- libc.benchmarks.gpu.timing.timing
+ libc.src.__support.GPU.utils
+ libc.src.__support.time.gpu.time_utils
libc.src.stdio.printf
+ libc.src.time.clock
)
add_subdirectory(src)
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index ef816c51a87d7..50612e8d571a4 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -2,7 +2,6 @@
#include "hdr/stdint_proxy.h"
#include "src/__support/CPP/algorithm.h"
-#include "src/__support/CPP/array.h"
#include "src/__support/CPP/atomic.h"
#include "src/__support/CPP/string.h"
#include "src/__support/FPUtil/FPBits.h"
@@ -12,6 +11,7 @@
#include "src/__support/macros/config.h"
#include "src/__support/time/gpu/time_utils.h"
#include "src/stdio/printf.h"
+#include "src/time/clock.h"
namespace LIBC_NAMESPACE_DECL {
namespace benchmarks {
@@ -136,9 +136,11 @@ void print_results(Benchmark *b) {
LIBC_NAMESPACE::printf(
"%-24s |%15.0f |%9.0f |%8llu |%8llu |%11llu |%9u |\n",
b->get_test_name().data(), final_result.cycles,
- final_result.standard_deviation, (unsigned long long)final_result.min,
- (unsigned long long)final_result.max,
- (unsigned long long)final_result.total_iterations, (unsigned)num_threads);
+ final_result.standard_deviation,
+ static_cast<unsigned long long>(final_result.min),
+ static_cast<unsigned long long>(final_result.max),
+ static_cast<unsigned long long>(final_result.total_iterations),
+ static_cast<unsigned>(num_threads));
}
void print_header() {
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index 60f69edf86556..e36e93c7efc18 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -1,18 +1,16 @@
#ifndef LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H
#define LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H
-#include "benchmarks/gpu/BenchmarkLogger.h"
#include "benchmarks/gpu/timing/timing.h"
+
#include "hdr/stdint_proxy.h"
#include "src/__support/CPP/algorithm.h"
#include "src/__support/CPP/array.h"
-#include "src/__support/CPP/limits.h"
#include "src/__support/CPP/string_view.h"
#include "src/__support/CPP/type_traits.h"
#include "src/__support/FPUtil/FPBits.h"
#include "src/__support/FPUtil/sqrt.h"
#include "src/__support/macros/config.h"
-#include "src/time/clock.h"
namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt b/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt
index d6a89d04dab97..f85152e69c346 100644
--- a/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt
@@ -4,10 +4,11 @@ add_header_library(
timing.h
DEPENDS
libc.hdr.stdint_proxy
- libc.src.__support.common
libc.src.__support.macros.config
libc.src.__support.macros.attributes
libc.src.__support.CPP.algorithm
libc.src.__support.CPP.array
+ libc.src.__support.CPP.atomic
libc.src.__support.CPP.type_traits
+ libc.src.__support.GPU.utils
)
diff --git a/libc/benchmarks/gpu/timing/amdgpu/timing.h b/libc/benchmarks/gpu/timing/amdgpu/timing.h
index de721a2d6ce6b..b4a174f729817 100644
--- a/libc/benchmarks/gpu/timing/amdgpu/timing.h
+++ b/libc/benchmarks/gpu/timing/amdgpu/timing.h
@@ -15,7 +15,6 @@
#include "src/__support/CPP/atomic.h"
#include "src/__support/CPP/type_traits.h"
#include "src/__support/GPU/utils.h"
-#include "src/__support/common.h"
#include "src/__support/macros/attributes.h"
#include "src/__support/macros/config.h"
diff --git a/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt b/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt
index 801080e7a6e98..4615f53e3d247 100644
--- a/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt
+++ b/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt
@@ -4,10 +4,11 @@ add_header_library(
timing.h
DEPENDS
libc.hdr.stdint_proxy
- libc.src.__support.common
libc.src.__support.macros.config
libc.src.__support.macros.attributes
libc.src.__support.CPP.algorithm
libc.src.__support.CPP.array
+ libc.src.__support.CPP.atomic
libc.src.__support.CPP.type_traits
+ libc.src.__support.GPU.utils
)
diff --git a/libc/benchmarks/gpu/timing/nvptx/timing.h b/libc/benchmarks/gpu/timing/nvptx/timing.h
index 133032ca08423..d0c8bd1f9ee01 100644
--- a/libc/benchmarks/gpu/timing/nvptx/timing.h
+++ b/libc/benchmarks/gpu/timing/nvptx/timing.h
@@ -13,9 +13,7 @@
#include "src/__support/CPP/algorithm.h"
#include "src/__support/CPP/array.h"
#include "src/__support/CPP/atomic.h"
-#include "src/__support/CPP/type_traits.h"
#include "src/__support/GPU/utils.h"
-#include "src/__support/common.h"
#include "src/__support/macros/attributes.h"
#include "src/__support/macros/config.h"
>From 607f4e7677d64c06c9c87514e0adaad5fcfc1c2d Mon Sep 17 00:00:00 2001
From: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
Date: Fri, 15 Aug 2025 09:54:38 -0300
Subject: [PATCH 2/4] Remove dead `BenchmarkLogger`
---
libc/benchmarks/gpu/BenchmarkLogger.cpp | 97 -------------------------
libc/benchmarks/gpu/BenchmarkLogger.h | 29 --------
2 files changed, 126 deletions(-)
delete mode 100644 libc/benchmarks/gpu/BenchmarkLogger.cpp
delete mode 100644 libc/benchmarks/gpu/BenchmarkLogger.h
diff --git a/libc/benchmarks/gpu/BenchmarkLogger.cpp b/libc/benchmarks/gpu/BenchmarkLogger.cpp
deleted file mode 100644
index d5996a74f6dd7..0000000000000
--- a/libc/benchmarks/gpu/BenchmarkLogger.cpp
+++ /dev/null
@@ -1,97 +0,0 @@
-#include "benchmarks/gpu/BenchmarkLogger.h"
-#include "hdr/stdint_proxy.h"
-#include "src/__support/CPP/string.h"
-#include "src/__support/CPP/string_view.h"
-#include "src/__support/OSUtil/io.h" // write_to_stderr
-#include "src/__support/big_int.h" // is_big_int
-#include "src/__support/macros/config.h"
-#include "src/__support/macros/properties/types.h" // LIBC_TYPES_HAS_INT128
-#include "src/__support/uint128.h"
-
-namespace LIBC_NAMESPACE_DECL {
-namespace benchmarks {
-
-// cpp::string_view specialization
-template <>
-BenchmarkLogger &
- BenchmarkLogger::operator<< <cpp::string_view>(cpp::string_view str) {
- LIBC_NAMESPACE::write_to_stderr(str);
- return *this;
-}
-
-// cpp::string specialization
-template <>
-BenchmarkLogger &BenchmarkLogger::operator<< <cpp::string>(cpp::string str) {
- return *this << static_cast<cpp::string_view>(str);
-}
-
-// const char* specialization
-template <>
-BenchmarkLogger &BenchmarkLogger::operator<< <const char *>(const char *str) {
- return *this << cpp::string_view(str);
-}
-
-// char* specialization
-template <> BenchmarkLogger &BenchmarkLogger::operator<< <char *>(char *str) {
- return *this << cpp::string_view(str);
-}
-
-// char specialization
-template <> BenchmarkLogger &BenchmarkLogger::operator<<(char ch) {
- return *this << cpp::string_view(&ch, 1);
-}
-
-// bool specialization
-template <> BenchmarkLogger &BenchmarkLogger::operator<<(bool cond) {
- return *this << (cond ? "true" : "false");
-}
-
-// void * specialization
-template <> BenchmarkLogger &BenchmarkLogger::operator<<(void *addr) {
- return *this << "0x" << cpp::to_string(reinterpret_cast<uintptr_t>(addr));
-}
-
-template <typename T> BenchmarkLogger &BenchmarkLogger::operator<<(T t) {
- if constexpr (is_big_int_v<T> ||
- (cpp::is_integral_v<T> && cpp::is_unsigned_v<T> &&
- (sizeof(T) > sizeof(uint64_t)))) {
- static_assert(sizeof(T) % 8 == 0, "Unsupported size of UInt");
- const IntegerToString<T, radix::Hex::WithPrefix> buffer(t);
- return *this << buffer.view();
- } else {
- return *this << cpp::to_string(t);
- }
-}
-
-// is_integral specializations
-// char is already specialized to handle character
-template BenchmarkLogger &BenchmarkLogger::operator<< <short>(short);
-template BenchmarkLogger &BenchmarkLogger::operator<< <int>(int);
-template BenchmarkLogger &BenchmarkLogger::operator<< <long>(long);
-template BenchmarkLogger &BenchmarkLogger::operator<< <long long>(long long);
-template BenchmarkLogger &
- BenchmarkLogger::operator<< <unsigned char>(unsigned char);
-template BenchmarkLogger &
- BenchmarkLogger::operator<< <unsigned short>(unsigned short);
-template BenchmarkLogger &
- BenchmarkLogger::operator<< <unsigned int>(unsigned int);
-template BenchmarkLogger &
- BenchmarkLogger::operator<< <unsigned long>(unsigned long);
-template BenchmarkLogger &
- BenchmarkLogger::operator<< <unsigned long long>(unsigned long long);
-
-#ifdef LIBC_TYPES_HAS_INT128
-template BenchmarkLogger &
- BenchmarkLogger::operator<< <__uint128_t>(__uint128_t);
-#endif // LIBC_TYPES_HAS_INT128
-template BenchmarkLogger &BenchmarkLogger::operator<< <UInt<128>>(UInt<128>);
-template BenchmarkLogger &BenchmarkLogger::operator<< <UInt<192>>(UInt<192>);
-template BenchmarkLogger &BenchmarkLogger::operator<< <UInt<256>>(UInt<256>);
-template BenchmarkLogger &BenchmarkLogger::operator<< <UInt<320>>(UInt<320>);
-
-// TODO: Add floating point formatting once it's supported by StringStream.
-
-BenchmarkLogger log;
-
-} // namespace benchmarks
-} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/benchmarks/gpu/BenchmarkLogger.h b/libc/benchmarks/gpu/BenchmarkLogger.h
deleted file mode 100644
index 2b22aba085f86..0000000000000
--- a/libc/benchmarks/gpu/BenchmarkLogger.h
+++ /dev/null
@@ -1,29 +0,0 @@
-//===-- Utilities to log to standard output during tests --------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_BENCHMARKS_GPU_BENCHMARKLOGGER_H
-#define LLVM_LIBC_BENCHMARKS_GPU_BENCHMARKLOGGER_H
-
-#include "src/__support/macros/config.h"
-
-namespace LIBC_NAMESPACE_DECL {
-namespace benchmarks {
-
-// A class to log to standard output in the context of hermetic tests.
-struct BenchmarkLogger {
- constexpr BenchmarkLogger() = default;
- template <typename T> BenchmarkLogger &operator<<(T);
-};
-
-// A global TestLogger instance to be used in tests.
-extern BenchmarkLogger log;
-
-} // namespace benchmarks
-} // namespace LIBC_NAMESPACE_DECL
-
-#endif /* LLVM_LIBC_BENCHMARKS_GPU_BENCHMARKLOGGER_H */
>From 6642c9ab159d916da3d72521d74b080dfef9ba4c Mon Sep 17 00:00:00 2001
From: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
Date: Fri, 15 Aug 2025 10:04:58 -0300
Subject: [PATCH 3/4] Fix precision-loss warning in NVPTX version of
`latency()`
---
libc/benchmarks/gpu/timing/nvptx/timing.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/libc/benchmarks/gpu/timing/nvptx/timing.h b/libc/benchmarks/gpu/timing/nvptx/timing.h
index d0c8bd1f9ee01..0c93a67129b8d 100644
--- a/libc/benchmarks/gpu/timing/nvptx/timing.h
+++ b/libc/benchmarks/gpu/timing/nvptx/timing.h
@@ -64,7 +64,7 @@ template <typename F, typename T>
uint64_t stop = gpu::processor_clock();
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
asm("" ::"r"(stop));
- volatile T output = result;
+ volatile auto output = result;
// Return the time elapsed.
return stop - start;
>From 71dd5685e22577e781b2b56dd1e832646149f687 Mon Sep 17 00:00:00 2001
From: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
Date: Fri, 15 Aug 2025 19:31:29 -0300
Subject: [PATCH 4/4] Round up scaled iteration count to ensure growth
---
libc/benchmarks/gpu/CMakeLists.txt | 1 +
libc/benchmarks/gpu/LibcGpuBenchmark.cpp | 8 +++++---
2 files changed, 6 insertions(+), 3 deletions(-)
diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
index cfd458552e96c..6ca134b12a479 100644
--- a/libc/benchmarks/gpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/CMakeLists.txt
@@ -50,6 +50,7 @@ add_unittest_framework_library(
libc.src.__support.CPP.atomic
libc.src.__support.CPP.array
libc.src.__support.FPUtil.fp_bits
+ libc.src.__support.FPUtil.nearest_integer_operations
libc.src.__support.FPUtil.sqrt
libc.src.__support.fixedvector
libc.src.__support.GPU.utils
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index 50612e8d571a4..a4a0ff4ec46e5 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -5,6 +5,7 @@
#include "src/__support/CPP/atomic.h"
#include "src/__support/CPP/string.h"
#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/NearestIntegerOperations.h"
#include "src/__support/FPUtil/sqrt.h"
#include "src/__support/GPU/utils.h"
#include "src/__support/fixedvector.h"
@@ -134,7 +135,7 @@ void print_results(Benchmark *b) {
cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
LIBC_NAMESPACE::printf(
- "%-24s |%15.0f |%9.0f |%8llu |%8llu |%11llu |%9u |\n",
+ "%-24s |%15.0f |%9.0f |%8llu |%8llu |%15llu |%9u |\n",
b->get_test_name().data(), final_result.cycles,
final_result.standard_deviation,
static_cast<unsigned long long>(final_result.min),
@@ -149,7 +150,7 @@ void print_header() {
benchmarks[0]->get_suite_name().data());
LIBC_NAMESPACE::printf("%s", RESET);
cpp::string titles = "Benchmark | Cycles (Mean) | Stddev | "
- " Min | Max | Iterations | Threads |\n";
+ " Min | Max | Iterations | Threads |\n";
LIBC_NAMESPACE::printf(titles.data());
cpp::string separator(titles.size(), '-');
@@ -228,7 +229,8 @@ BenchmarkResult benchmark(const BenchmarkOptions &options,
change_ratio < options.epsilon)
break;
- iterations = static_cast<uint32_t>(iterations * options.scaling_factor);
+ iterations = static_cast<uint32_t>(
+ fputil::ceil(iterations * options.scaling_factor));
}
const auto &estimator = rep.get_estimator();
More information about the libc-commits mailing list