[libc-commits] [libc] [libc] NVPTX Profiling (PR #92009)

via libc-commits libc-commits at lists.llvm.org
Thu Jun 20 19:49:28 PDT 2024


https://github.com/jameshu15869 updated https://github.com/llvm/llvm-project/pull/92009

>From b6b47fb3b6d7560d667efb0841710740be3db714 Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Sat, 11 May 2024 17:19:54 -0400
Subject: [PATCH 01/18] initial nvptx microbenchmarking infrastructure

---
 libc/benchmarks/CMakeLists.txt                | 416 +++++++++---------
 libc/benchmarks/gpu/BenchmarkLogger.cpp       |  89 ++++
 libc/benchmarks/gpu/BenchmarkLogger.h         |  27 ++
 libc/benchmarks/gpu/CMakeLists.txt            | 183 ++++++++
 libc/benchmarks/gpu/LibcGpuBenchmark.cpp      |  70 +++
 libc/benchmarks/gpu/LibcGpuBenchmark.h        | 122 +++++
 libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp  |   6 +
 libc/benchmarks/gpu/src/CMakeLists.txt        |   2 +
 libc/benchmarks/gpu/src/ctype/CMakeLists.txt  |  21 +
 .../gpu/src/ctype/isalnum_benchmark.cpp       |  22 +
 .../gpu/src/ctype/isalpha_benchmark.cpp       |   9 +
 libc/benchmarks/gpu/src/math/CMakeLists.txt   |   0
 libc/benchmarks/gpu/timing/CMakeLists.txt     |  12 +
 .../gpu/timing/nvptx/CMakeLists.txt           |   7 +
 libc/benchmarks/gpu/timing/nvptx/timing.h     | 108 +++++
 libc/benchmarks/gpu/timing/timing.h           |  22 +
 16 files changed, 911 insertions(+), 205 deletions(-)
 create mode 100644 libc/benchmarks/gpu/BenchmarkLogger.cpp
 create mode 100644 libc/benchmarks/gpu/BenchmarkLogger.h
 create mode 100644 libc/benchmarks/gpu/CMakeLists.txt
 create mode 100644 libc/benchmarks/gpu/LibcGpuBenchmark.cpp
 create mode 100644 libc/benchmarks/gpu/LibcGpuBenchmark.h
 create mode 100644 libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp
 create mode 100644 libc/benchmarks/gpu/src/CMakeLists.txt
 create mode 100644 libc/benchmarks/gpu/src/ctype/CMakeLists.txt
 create mode 100644 libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp
 create mode 100644 libc/benchmarks/gpu/src/ctype/isalpha_benchmark.cpp
 create mode 100644 libc/benchmarks/gpu/src/math/CMakeLists.txt
 create mode 100644 libc/benchmarks/gpu/timing/CMakeLists.txt
 create mode 100644 libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt
 create mode 100644 libc/benchmarks/gpu/timing/nvptx/timing.h
 create mode 100644 libc/benchmarks/gpu/timing/timing.h

diff --git a/libc/benchmarks/CMakeLists.txt b/libc/benchmarks/CMakeLists.txt
index 4978da65850cc..a802e653a091e 100644
--- a/libc/benchmarks/CMakeLists.txt
+++ b/libc/benchmarks/CMakeLists.txt
@@ -1,205 +1,211 @@
-find_package(Threads)
-
-set(LLVM_LINK_COMPONENTS
-  Support
-  TargetParser
-  )
-
-#==============================================================================
-# Add Unit Testing Support
-#==============================================================================
-
-function(add_libc_benchmark_unittest target_name)
-  if(NOT LLVM_INCLUDE_TESTS)
-    return()
-  endif()
-
-  cmake_parse_arguments(
-    "LIBC_BENCHMARKS_UNITTEST"
-    "" # No optional arguments
-    "SUITE" # Single value arguments
-    "SRCS;DEPENDS" # Multi-value arguments
-    ${ARGN}
-  )
-
-  add_executable(${target_name}
-    EXCLUDE_FROM_ALL
-    ${LIBC_BENCHMARKS_UNITTEST_SRCS}
-  )
-  target_link_libraries(${target_name}
-    PRIVATE
-    llvm_gtest_main
-    llvm_gtest
-    ${LIBC_BENCHMARKS_UNITTEST_DEPENDS}
-  )
-  llvm_update_compile_flags(${target_name})
-
-  add_custom_command(
-    TARGET ${target_name}
-    POST_BUILD
-    COMMAND $<TARGET_FILE:${target_name}>
-  )
-  add_dependencies(libc-benchmark-util-tests ${target_name})
-endfunction()
-
-#==============================================================================
-# Build Google Benchmark for libc
-#==============================================================================
-
-include(ExternalProject)
-ExternalProject_Add(google-benchmark-libc
-        EXCLUDE_FROM_ALL ON
-        PREFIX google-benchmark-libc
-        SOURCE_DIR ${LLVM_THIRD_PARTY_DIR}/benchmark
-        INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/google-benchmark-libc
-        CMAKE_CACHE_ARGS
-          -DBENCHMARK_ENABLE_EXCEPTIONS:BOOL=OFF
-          -DBENCHMARK_ENABLE_LTO:BOOL=OFF
-          -DBENCHMARK_ENABLE_TESTING:BOOL=OFF
-          -DBENCHMARK_ENABLE_WERROR:BOOL=${LLVM_ENABLE_WERROR}
-          -DBENCHMARK_FORCE_WERROR:BOOL=OFF
-          -DBENCHMARK_USE_LIBCXX:BOOL=OFF
-          -DCMAKE_BUILD_TYPE:STRING=Release
-
-          -DCMAKE_SYSTEM_NAME:STRING=${CMAKE_SYSTEM_NAME}
-          -DCMAKE_SYSTEM_PROCESSOR:STRING=${CMAKE_SYSTEM_PROCESSOR}
-          -DCMAKE_C_COMPILER:STRING=${CMAKE_C_COMPILER}
-          -DCMAKE_CXX_COMPILER:STRING=${CMAKE_CXX_COMPILER}
-          -DCMAKE_CXX_FLAGS:STRING=${CMAKE_CXX_FLAGS}
-          -DCMAKE_FIND_ROOT_PATH:STRING=${CMAKE_FIND_ROOT_PATH}
-
-          -DBUILD_SHARED_LIBS:BOOL=OFF
-          -DCMAKE_EXE_LINKER_FLAGS:STRING=-static
-
-          -DCMAKE_CXX_STANDARD:STRING=14
-          -DCMAKE_INSTALL_PREFIX:PATH=<INSTALL_DIR>
-        )
-
-add_custom_target(libc-benchmark-util-tests)
-
-# libc-benchmark
-add_library(libc-benchmark
-    STATIC
-    EXCLUDE_FROM_ALL
-    LibcBenchmark.cpp
-    LibcBenchmark.h
-)
-
-target_include_directories(libc-benchmark
-    PUBLIC ${LLVM_INCLUDE_DIR} ${LLVM_MAIN_INCLUDE_DIR}
-)
-target_link_libraries(libc-benchmark
-    PUBLIC
-    benchmark::benchmark
-    LLVMSupport
-    LLVMTargetParser
-    Threads::Threads
-)
-add_dependencies(libc-benchmark google-benchmark-libc)
-llvm_update_compile_flags(libc-benchmark)
-
-add_libc_benchmark_unittest(libc-benchmark-test
-    SRCS LibcBenchmarkTest.cpp
-    DEPENDS libc-benchmark
-)
-
-# libc-memory-benchmark
-add_library(libc-memory-benchmark
-    STATIC
-    EXCLUDE_FROM_ALL
-    LibcMemoryBenchmark.cpp
-    LibcMemoryBenchmark.h
-    LibcFunctionPrototypes.h
-    MemorySizeDistributions.cpp
-    MemorySizeDistributions.h
-)
-target_include_directories(libc-memory-benchmark
-    PUBLIC
-    ${CMAKE_CURRENT_SOURCE_DIR}
-)
-target_link_libraries(libc-memory-benchmark
-    PUBLIC
-    libc-benchmark
-)
-llvm_update_compile_flags(libc-memory-benchmark)
-
-add_libc_benchmark_unittest(libc-memory-benchmark-test
-    SRCS LibcMemoryBenchmarkTest.cpp
-    DEPENDS libc-memory-benchmark
-)
-
-# json
-add_library(json
-    STATIC
-    EXCLUDE_FROM_ALL
-    JSON.cpp
-    JSON.h
-)
-target_link_libraries(json PUBLIC libc-memory-benchmark)
-llvm_update_compile_flags(json)
-
-add_libc_benchmark_unittest(json-test
-    SRCS JSONTest.cpp
-    DEPENDS json
-)
-
-#==============================================================================
-# Benchmarking tool
-#==============================================================================
-
-# Benchmark all implementations that can run on the target CPU.
-function(add_libc_multi_impl_benchmark name)
-  get_property(fq_implementations GLOBAL PROPERTY ${name}_implementations)
-  foreach(fq_config_name IN LISTS fq_implementations)
-    get_target_property(required_cpu_features ${fq_config_name} REQUIRE_CPU_FEATURES)
-    cpu_supports(can_run "${required_cpu_features}")
-    if(can_run)
-        set(benchmark_name ${fq_config_name}_benchmark)
-        add_executable(${benchmark_name}
-            EXCLUDE_FROM_ALL
-            LibcMemoryBenchmarkMain.cpp
-        )
-        get_target_property(entrypoint_object_file ${fq_config_name} "OBJECT_FILE_RAW")
-        target_link_libraries(${benchmark_name} PUBLIC json ${entrypoint_object_file})
-        string(TOUPPER ${name} name_upper)
-        target_compile_definitions(${benchmark_name} PRIVATE "-DLIBC_BENCHMARK_FUNCTION_${name_upper}=LIBC_NAMESPACE::${name}" "-DLIBC_BENCHMARK_FUNCTION_NAME=\"${fq_config_name}\"")
-        llvm_update_compile_flags(${benchmark_name})
-    else()
-      message(STATUS "Skipping benchmark for '${fq_config_name}' insufficient host cpu features '${required_cpu_features}'")
-    endif()
-  endforeach()
-endfunction()
-
-add_libc_multi_impl_benchmark(bcmp)
-add_libc_multi_impl_benchmark(bzero)
-add_libc_multi_impl_benchmark(memcmp)
-add_libc_multi_impl_benchmark(memcpy)
-add_libc_multi_impl_benchmark(memmove)
-add_libc_multi_impl_benchmark(memset)
-
-#==============================================================================
-# Google Benchmarking tool
-#==============================================================================
-
-# This target uses the Google Benchmark facility to report throughput for llvm
-# libc memory functions compiled for the host machine. This is useful to
-# continuously monitor the performance of the memory functions.
-add_executable(libc.benchmarks.memory_functions.opt_host
-  EXCLUDE_FROM_ALL
-  LibcMemoryGoogleBenchmarkMain.cpp
-  LibcDefaultImplementations.cpp
-)
-target_link_libraries(libc.benchmarks.memory_functions.opt_host
-  PRIVATE
-  libc-memory-benchmark
-  libc.src.string.memcmp_opt_host.__internal__
-  libc.src.string.bcmp_opt_host.__internal__
-  libc.src.string.memcpy_opt_host.__internal__
-  libc.src.string.memset_opt_host.__internal__
-  libc.src.string.bzero_opt_host.__internal__
-  libc.src.string.memmove_opt_host.__internal__
-  benchmark_main
-)
-llvm_update_compile_flags(libc.benchmarks.memory_functions.opt_host)
-
-add_subdirectory(automemcpy)
+if(NOT LIBC_TARGET_OS_IS_GPU)
+	find_package(Threads)
+
+	set(LLVM_LINK_COMPONENTS
+	  Support
+	  TargetParser
+	  )
+
+	#==============================================================================
+	# Add Unit Testing Support
+	#==============================================================================
+
+	function(add_libc_benchmark_unittest target_name)
+	  if(NOT LLVM_INCLUDE_TESTS)
+	    return()
+	  endif()
+
+	  cmake_parse_arguments(
+	    "LIBC_BENCHMARKS_UNITTEST"
+	    "" # No optional arguments
+	    "SUITE" # Single value arguments
+	    "SRCS;DEPENDS" # Multi-value arguments
+	    ${ARGN}
+	  )
+
+	  add_executable(${target_name}
+	    EXCLUDE_FROM_ALL
+	    ${LIBC_BENCHMARKS_UNITTEST_SRCS}
+	  )
+	  target_link_libraries(${target_name}
+	    PRIVATE
+	    llvm_gtest_main
+	    llvm_gtest
+	    ${LIBC_BENCHMARKS_UNITTEST_DEPENDS}
+	  )
+	  llvm_update_compile_flags(${target_name})
+
+	  add_custom_command(
+	    TARGET ${target_name}
+	    POST_BUILD
+	    COMMAND $<TARGET_FILE:${target_name}>
+	  )
+	  add_dependencies(libc-benchmark-util-tests ${target_name})
+	endfunction()
+
+	#==============================================================================
+	# Build Google Benchmark for libc
+	#==============================================================================
+
+	include(ExternalProject)
+	ExternalProject_Add(google-benchmark-libc
+		EXCLUDE_FROM_ALL ON
+		PREFIX google-benchmark-libc
+		SOURCE_DIR ${LLVM_THIRD_PARTY_DIR}/benchmark
+		INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/google-benchmark-libc
+		CMAKE_CACHE_ARGS
+		  -DBENCHMARK_ENABLE_EXCEPTIONS:BOOL=OFF
+		  -DBENCHMARK_ENABLE_LTO:BOOL=OFF
+		  -DBENCHMARK_ENABLE_TESTING:BOOL=OFF
+		  -DBENCHMARK_ENABLE_WERROR:BOOL=${LLVM_ENABLE_WERROR}
+		  -DBENCHMARK_FORCE_WERROR:BOOL=OFF
+		  -DBENCHMARK_USE_LIBCXX:BOOL=OFF
+		  -DCMAKE_BUILD_TYPE:STRING=Release
+
+		  -DCMAKE_SYSTEM_NAME:STRING=${CMAKE_SYSTEM_NAME}
+		  -DCMAKE_SYSTEM_PROCESSOR:STRING=${CMAKE_SYSTEM_PROCESSOR}
+		  -DCMAKE_C_COMPILER:STRING=${CMAKE_C_COMPILER}
+		  -DCMAKE_CXX_COMPILER:STRING=${CMAKE_CXX_COMPILER}
+		  -DCMAKE_CXX_FLAGS:STRING=${CMAKE_CXX_FLAGS}
+		  -DCMAKE_FIND_ROOT_PATH:STRING=${CMAKE_FIND_ROOT_PATH}
+
+		  -DBUILD_SHARED_LIBS:BOOL=OFF
+		  -DCMAKE_EXE_LINKER_FLAGS:STRING=-static
+
+		  -DCMAKE_CXX_STANDARD:STRING=14
+		  -DCMAKE_INSTALL_PREFIX:PATH=<INSTALL_DIR>
+		)
+
+	add_custom_target(libc-benchmark-util-tests)
+
+	# libc-benchmark
+	add_library(libc-benchmark
+	    STATIC
+	    EXCLUDE_FROM_ALL
+	    LibcBenchmark.cpp
+	    LibcBenchmark.h
+	)
+
+	target_include_directories(libc-benchmark
+	    PUBLIC ${LLVM_INCLUDE_DIR} ${LLVM_MAIN_INCLUDE_DIR}
+	)
+	target_link_libraries(libc-benchmark
+	    PUBLIC
+	    benchmark::benchmark
+            LLVMSupport
+	    LLVMTargetParser
+	    Threads::Threads
+	)
+	add_dependencies(libc-benchmark google-benchmark-libc)
+	llvm_update_compile_flags(libc-benchmark)
+
+	add_libc_benchmark_unittest(libc-benchmark-test
+	    SRCS LibcBenchmarkTest.cpp
+	    DEPENDS libc-benchmark
+	)
+
+	# libc-memory-benchmark
+	add_library(libc-memory-benchmark
+	    STATIC
+	    EXCLUDE_FROM_ALL
+	    LibcMemoryBenchmark.cpp
+	    LibcMemoryBenchmark.h
+	    LibcFunctionPrototypes.h
+	    MemorySizeDistributions.cpp
+	    MemorySizeDistributions.h
+	)
+	target_include_directories(libc-memory-benchmark
+	    PUBLIC
+	    ${CMAKE_CURRENT_SOURCE_DIR}
+	)
+	target_link_libraries(libc-memory-benchmark
+	    PUBLIC
+	    libc-benchmark
+	)
+	llvm_update_compile_flags(libc-memory-benchmark)
+
+	add_libc_benchmark_unittest(libc-memory-benchmark-test
+	    SRCS LibcMemoryBenchmarkTest.cpp
+	    DEPENDS libc-memory-benchmark
+	)
+
+	# json
+	add_library(json
+	    STATIC
+	    EXCLUDE_FROM_ALL
+	    JSON.cpp
+	    JSON.h
+	)
+	target_link_libraries(json PUBLIC libc-memory-benchmark)
+	llvm_update_compile_flags(json)
+
+	add_libc_benchmark_unittest(json-test
+	    SRCS JSONTest.cpp
+	    DEPENDS json
+	)
+
+	#==============================================================================
+	# Benchmarking tool
+	#==============================================================================
+
+	# Benchmark all implementations that can run on the target CPU.
+	function(add_libc_multi_impl_benchmark name)
+	  get_property(fq_implementations GLOBAL PROPERTY ${name}_implementations)
+	  foreach(fq_config_name IN LISTS fq_implementations)
+	    get_target_property(required_cpu_features ${fq_config_name} REQUIRE_CPU_FEATURES)
+	    cpu_supports(can_run "${required_cpu_features}")
+	    if(can_run)
+		set(benchmark_name ${fq_config_name}_benchmark)
+		add_executable(${benchmark_name}
+		    EXCLUDE_FROM_ALL
+		    LibcMemoryBenchmarkMain.cpp
+		)
+		get_target_property(entrypoint_object_file ${fq_config_name} "OBJECT_FILE_RAW")
+		target_link_libraries(${benchmark_name} PUBLIC json ${entrypoint_object_file})
+		string(TOUPPER ${name} name_upper)
+		target_compile_definitions(${benchmark_name} PRIVATE "-DLIBC_BENCHMARK_FUNCTION_${name_upper}=LIBC_NAMESPACE::${name}" "-DLIBC_BENCHMARK_FUNCTION_NAME=\"${fq_config_name}\"")
+		llvm_update_compile_flags(${benchmark_name})
+	    else()
+	      message(STATUS "Skipping benchmark for '${fq_config_name}' insufficient host cpu features '${required_cpu_features}'")
+	    endif()
+	  endforeach()
+	endfunction()
+
+	add_libc_multi_impl_benchmark(bcmp)
+	add_libc_multi_impl_benchmark(bzero)
+	add_libc_multi_impl_benchmark(memcmp)
+	add_libc_multi_impl_benchmark(memcpy)
+	add_libc_multi_impl_benchmark(memmove)
+	add_libc_multi_impl_benchmark(memset)
+
+	#==============================================================================
+	# Google Benchmarking tool
+	#==============================================================================
+
+	# This target uses the Google Benchmark facility to report throughput for llvm
+	# libc memory functions compiled for the host machine. This is useful to
+	# continuously monitor the performance of the memory functions.
+	add_executable(libc.benchmarks.memory_functions.opt_host
+	  EXCLUDE_FROM_ALL
+	  LibcMemoryGoogleBenchmarkMain.cpp
+	  LibcDefaultImplementations.cpp
+	)
+	target_link_libraries(libc.benchmarks.memory_functions.opt_host
+	  PRIVATE
+	  libc-memory-benchmark
+	  libc.src.string.memcmp_opt_host.__internal__
+	  libc.src.string.bcmp_opt_host.__internal__
+	  libc.src.string.memcpy_opt_host.__internal__
+	  libc.src.string.memset_opt_host.__internal__
+	  libc.src.string.bzero_opt_host.__internal__
+	  libc.src.string.memmove_opt_host.__internal__
+	  benchmark_main
+	)
+	llvm_update_compile_flags(libc.benchmarks.memory_functions.opt_host)
+
+	add_subdirectory(automemcpy)
+endif()
+
+if(LIBC_TARGET_OS_IS_GPU)
+	add_subdirectory(gpu)
+endif()
diff --git a/libc/benchmarks/gpu/BenchmarkLogger.cpp b/libc/benchmarks/gpu/BenchmarkLogger.cpp
new file mode 100644
index 0000000000000..94a0d897c9585
--- /dev/null
+++ b/libc/benchmarks/gpu/BenchmarkLogger.cpp
@@ -0,0 +1,89 @@
+#include "benchmarks/gpu/BenchmarkLogger.h"
+#include "src/__support/CPP/string.h"
+#include "src/__support/CPP/string_view.h"
+#include "src/__support/OSUtil/io.h"               // write_to_stderr
+#include "src/__support/big_int.h"                 // is_big_int
+#include "src/__support/macros/properties/types.h" // LIBC_TYPES_HAS_INT128
+#include "src/__support/uint128.h"
+
+#include <stdint.h>
+
+namespace LIBC_NAMESPACE {
+namespace libc_gpu_benchmarks {
+
+// cpp::string_view specialization
+template <>
+BenchmarkLogger &BenchmarkLogger::operator<< <cpp::string_view>(cpp::string_view str) {
+  LIBC_NAMESPACE::write_to_stderr(str);
+  return *this;
+}
+
+// cpp::string specialization
+template <> BenchmarkLogger &BenchmarkLogger::operator<< <cpp::string>(cpp::string str) {
+  return *this << static_cast<cpp::string_view>(str);
+}
+
+// const char* specialization
+template <> BenchmarkLogger &BenchmarkLogger::operator<< <const char *>(const char *str) {
+  return *this << cpp::string_view(str);
+}
+
+// char* specialization
+template <> BenchmarkLogger &BenchmarkLogger::operator<< <char *>(char *str) {
+  return *this << cpp::string_view(str);
+}
+
+// char specialization
+template <> BenchmarkLogger &BenchmarkLogger::operator<<(char ch) {
+  return *this << cpp::string_view(&ch, 1);
+}
+
+// bool specialization
+template <> BenchmarkLogger &BenchmarkLogger::operator<<(bool cond) {
+  return *this << (cond ? "true" : "false");
+}
+
+// void * specialization: prints the address as "0x"-prefixed hexadecimal.
+template <> BenchmarkLogger &BenchmarkLogger::operator<<(void *addr) {
+  return *this << IntegerToString<uintptr_t, radix::Hex::WithPrefix>(reinterpret_cast<uintptr_t>(addr)).view();
+}
+
+template <typename T> BenchmarkLogger &BenchmarkLogger::operator<<(T t) {
+  if constexpr (is_big_int_v<T> ||
+                (cpp::is_integral_v<T> && cpp::is_unsigned_v<T> &&
+                 (sizeof(T) > sizeof(uint64_t)))) {
+    static_assert(sizeof(T) % 8 == 0, "Unsupported size of UInt");
+    const IntegerToString<T, radix::Hex::WithPrefix> buffer(t);
+    return *this << buffer.view();
+  } else {
+    return *this << cpp::to_string(t);
+  }
+}
+
+// is_integral specializations
+// char is already specialized to handle character
+template BenchmarkLogger &BenchmarkLogger::operator<< <short>(short);
+template BenchmarkLogger &BenchmarkLogger::operator<< <int>(int);
+template BenchmarkLogger &BenchmarkLogger::operator<< <long>(long);
+template BenchmarkLogger &BenchmarkLogger::operator<< <long long>(long long);
+template BenchmarkLogger &BenchmarkLogger::operator<< <unsigned char>(unsigned char);
+template BenchmarkLogger &BenchmarkLogger::operator<< <unsigned short>(unsigned short);
+template BenchmarkLogger &BenchmarkLogger::operator<< <unsigned int>(unsigned int);
+template BenchmarkLogger &BenchmarkLogger::operator<< <unsigned long>(unsigned long);
+template BenchmarkLogger &
+    BenchmarkLogger::operator<< <unsigned long long>(unsigned long long);
+
+#ifdef LIBC_TYPES_HAS_INT128
+template BenchmarkLogger &BenchmarkLogger::operator<< <__uint128_t>(__uint128_t);
+#endif // LIBC_TYPES_HAS_INT128
+template BenchmarkLogger &BenchmarkLogger::operator<< <UInt<128>>(UInt<128>);
+template BenchmarkLogger &BenchmarkLogger::operator<< <UInt<192>>(UInt<192>);
+template BenchmarkLogger &BenchmarkLogger::operator<< <UInt<256>>(UInt<256>);
+template BenchmarkLogger &BenchmarkLogger::operator<< <UInt<320>>(UInt<320>);
+
+// TODO: Add floating point formatting once it's supported by StringStream.
+
+BenchmarkLogger blog;
+
+} // namespace libc_gpu_benchmarks
+} // namespace LIBC_NAMESPACE
diff --git a/libc/benchmarks/gpu/BenchmarkLogger.h b/libc/benchmarks/gpu/BenchmarkLogger.h
new file mode 100644
index 0000000000000..ed3cc97e59c6d
--- /dev/null
+++ b/libc/benchmarks/gpu/BenchmarkLogger.h
@@ -0,0 +1,27 @@
+//===-- Utilities to log to stderr in GPU benchmarks ------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_BENCHMARKS_GPU_BENCHMARKLOGGER_H
+#define LLVM_LIBC_BENCHMARKS_GPU_BENCHMARKLOGGER_H
+
+namespace LIBC_NAMESPACE {
+namespace libc_gpu_benchmarks {
+
+// A stream-style logger for GPU benchmarks; output is written to stderr.
+struct BenchmarkLogger {
+  constexpr BenchmarkLogger() = default;
+  template <typename T> BenchmarkLogger &operator<<(T);
+};
+
+// The global BenchmarkLogger instance shared by the benchmarks.
+extern BenchmarkLogger blog;
+
+} // namespace libc_gpu_benchmarks
+} // namespace LIBC_NAMESPACE
+
+#endif /* LLVM_LIBC_BENCHMARKS_GPU_BENCHMARKLOGGER_H */
diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
new file mode 100644
index 0000000000000..a18be27e33573
--- /dev/null
+++ b/libc/benchmarks/gpu/CMakeLists.txt
@@ -0,0 +1,203 @@
+add_subdirectory(timing)
+
+add_custom_target(gpu-benchmark)
+
+# Defines a hermetic GPU benchmark executable plus a target that runs it.
+#
+# Usage:
+#   add_gpu_benchmark(
+#     <benchmark name>
+#     SUITE <custom target this benchmark is attached to>
+#     SRCS <list of benchmark source files>
+#     [HDRS <list of header files>]
+#     [DEPENDS <list of libc targets the benchmark depends on>]
+#     [ARGS <arguments passed to the benchmark executable>]
+#     [ENV <values placed in front of the run command>]
+#     [COMPILE_OPTIONS <extra options for compiling the benchmark>]
+#     [LINK_LIBRARIES <extra libraries; a `.hermetic` variant is preferred>]
+#     [LOADER_ARGS <arguments passed to the GPU loader>]
+#   )
+#
+# Building the fully qualified benchmark target runs the benchmark. That target
+# is also attached to SUITE and to the `gpu-benchmark` umbrella target.
+function(add_gpu_benchmark test_name)
+  # Resolve the fully qualified name up front: the diagnostics below use it.
+  get_fq_target_name(${test_name} fq_target_name)
+
+  if(NOT TARGET libc.startup.${LIBC_TARGET_OS}.crt1)
+    message(VERBOSE "Skipping ${fq_target_name} as it is not available on ${LIBC_TARGET_OS}.")
+    return()
+  endif()
+
+  cmake_parse_arguments(
+    "GPU_BENCHMARK"
+    "" # No optional arguments
+    "SUITE" # Single value arguments
+    "SRCS;HDRS;DEPENDS;ARGS;ENV;COMPILE_OPTIONS;LINK_LIBRARIES;LOADER_ARGS" # Multi-value arguments
+    ${ARGN}
+  )
+
+  if(NOT GPU_BENCHMARK_SUITE)
+    message(FATAL_ERROR "SUITE not specified for ${fq_target_name}")
+  endif()
+  if(NOT GPU_BENCHMARK_SRCS)
+    message(FATAL_ERROR "The SRCS list for add_gpu_benchmark is missing.")
+  endif()
+
+  get_fq_deps_list(fq_deps_list ${GPU_BENCHMARK_DEPENDS})
+  list(APPEND fq_deps_list
+      # Hermetic tests use the platform's startup object. So, their deps also
+      # have to be collected.
+      libc.startup.${LIBC_TARGET_OS}.crt1
+      # We always add the memory functions objects. This is because the
+      # compiler's codegen can emit calls to the C memory functions.
+      libc.src.string.bcmp
+      libc.src.string.bzero
+      libc.src.string.memcmp
+      libc.src.string.memcpy
+      libc.src.string.memmove
+      libc.src.string.memset
+      libc.src.__support.StringUtil.error_to_string
+  )
+
+  list(REMOVE_DUPLICATES fq_deps_list)
+
+  # TODO: Instead of gathering internal object files from entrypoints,
+  # collect the object files with public names of entrypoints.
+  get_object_files_for_test(
+      link_object_files skipped_entrypoints_list ${fq_deps_list})
+  if(skipped_entrypoints_list)
+    if(LIBC_CMAKE_VERBOSE_LOGGING)
+      message(STATUS "Skipping GPU benchmark ${fq_target_name} as it has "
+                     "missing deps: ${skipped_entrypoints_list}.")
+    endif()
+    return()
+  endif()
+  list(REMOVE_DUPLICATES link_object_files)
+
+  # Make a library of all deps
+  add_library(
+    ${fq_target_name}.__libc__
+    STATIC
+    EXCLUDE_FROM_ALL
+    ${link_object_files}
+  )
+  set_target_properties(${fq_target_name}.__libc__
+    PROPERTIES
+      ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+      ARCHIVE_OUTPUT_NAME ${fq_target_name}.libc
+  )
+
+  set(fq_build_target_name ${fq_target_name}.__build__)
+  add_executable(
+    ${fq_build_target_name}
+    EXCLUDE_FROM_ALL
+    $<$<BOOL:${LIBC_TARGET_ARCHITECTURE_IS_NVPTX}>:${link_object_files}>
+    ${GPU_BENCHMARK_SRCS}
+    ${GPU_BENCHMARK_HDRS}
+  )
+  set_target_properties(${fq_build_target_name}
+    PROPERTIES
+      RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+  )
+
+  _get_hermetic_test_compile_options(compile_options "${GPU_BENCHMARK_COMPILE_OPTIONS}")
+  target_include_directories(${fq_build_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR})
+  target_include_directories(${fq_build_target_name} PRIVATE ${LIBC_SOURCE_DIR})
+  target_compile_options(${fq_build_target_name} PRIVATE ${compile_options})
+
+  # Prefer the `.hermetic` variant of each requested library when it exists.
+  set(link_libraries "")
+  foreach(lib IN LISTS GPU_BENCHMARK_LINK_LIBRARIES)
+    if(TARGET ${lib}.hermetic)
+      list(APPEND link_libraries ${lib}.hermetic)
+    else()
+      list(APPEND link_libraries ${lib})
+    endif()
+  endforeach()
+
+  # Only the NVPTX branch below sets a suffix for the startup object.
+  set(internal_suffix "")
+  if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
+    target_link_options(${fq_build_target_name} PRIVATE
+      ${LIBC_COMPILE_OPTIONS_DEFAULT}
+      -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto -Wno-multi-gpu
+      "-Wl,-mllvm,-amdgpu-lower-global-ctor-dtor=0" -nostdlib -static
+      "-Wl,-mllvm,-amdhsa-code-object-version=${LIBC_GPU_CODE_OBJECT_VERSION}")
+  elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
+    # We need to use the internal object versions for NVPTX.
+    set(internal_suffix ".__internal__")
+    target_link_options(${fq_build_target_name} PRIVATE
+      ${LIBC_COMPILE_OPTIONS_DEFAULT} -Wno-multi-gpu
+      "-Wl,--suppress-stack-size-warning"
+      -march=${LIBC_GPU_TARGET_ARCHITECTURE} -nostdlib -static
+      "--cuda-path=${LIBC_CUDA_ROOT}")
+  elseif(LIBC_CC_SUPPORTS_NOSTDLIBPP)
+    target_link_options(${fq_build_target_name} PRIVATE -nolibc -nostartfiles -nostdlib++ -static)
+  else()
+    # Older version of gcc does not support `nostdlib++` flag.  We use
+    # `nostdlib` and link against libgcc_s, which cannot be linked statically.
+    target_link_options(${fq_build_target_name} PRIVATE -nolibc -nostartfiles -nostdlib)
+    list(APPEND link_libraries ${LIBGCC_S_LOCATION})
+  endif()
+
+  # link libraries for the BUILD target (i.e. to compile the test)
+  target_link_libraries(
+    ${fq_build_target_name}
+    PRIVATE
+      libc.startup.${LIBC_TARGET_OS}.crt1${internal_suffix}
+      ${link_libraries}
+      LibcGpuBenchmark.hermetic
+      LibcHermeticTestSupport.hermetic
+      # The NVIDIA 'nvlink' linker does not currently support static libraries.
+      $<$<NOT:$<BOOL:${LIBC_TARGET_ARCHITECTURE_IS_NVPTX}>>:${fq_target_name}.__libc__>)
+
+  add_dependencies(${fq_build_target_name}
+    LibcGpuBenchmark.hermetic
+    ${fq_deps_list})
+
+  # Tests on the GPU require an external loader utility to launch the kernel.
+  if(TARGET libc.utils.gpu.loader)
+    add_dependencies(${fq_build_target_name} libc.utils.gpu.loader)
+    get_target_property(gpu_loader_exe libc.utils.gpu.loader "EXECUTABLE")
+  endif()
+
+  set(test_cmd ${GPU_BENCHMARK_ENV}
+      $<$<BOOL:${LIBC_TARGET_OS_IS_GPU}>:${gpu_loader_exe}> ${CMAKE_CROSSCOMPILING_EMULATOR} ${GPU_BENCHMARK_LOADER_ARGS}
+      $<TARGET_FILE:${fq_build_target_name}> ${GPU_BENCHMARK_ARGS})
+  add_custom_target(
+    ${fq_target_name}
+    COMMAND ${test_cmd}
+    COMMAND_EXPAND_LISTS
+    COMMENT "Running GPU benchmark ${fq_target_name}"
+  )
+
+  # Make this benchmark part of its suite
+  add_dependencies(${GPU_BENCHMARK_SUITE} ${fq_target_name})
+  # Remember to make this benchmark part of the umbrella command
+  add_dependencies(gpu-benchmark ${fq_target_name})
+endfunction()
+
+add_unittest_framework_library(
+  LibcGpuBenchmark
+  SRCS
+    LibcGpuBenchmark.cpp
+    LibcGpuBenchmarkMain.cpp
+    BenchmarkLogger.cpp
+  HDRS
+    LibcGpuBenchmark.h
+    BenchmarkLogger.h
+  DEPENDS
+    libc.src.__support.big_int
+    libc.src.__support.c_string
+    libc.src.__support.CPP.string
+    libc.src.__support.CPP.string_view
+    libc.src.__support.CPP.type_traits
+    libc.src.__support.fixed_point.fx_rep
+    libc.src.__support.macros.properties.types
+    libc.src.__support.OSUtil.osutil
+    libc.src.__support.uint128
+    libc.benchmarks.gpu.timing.timing
+)
+
+add_subdirectory(src)
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
new file mode 100644
index 0000000000000..d37f5a0a53a70
--- /dev/null
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -0,0 +1,78 @@
+#include "LibcGpuBenchmark.h"
+
+namespace LIBC_NAMESPACE {
+namespace libc_gpu_benchmarks {
+
+// Head and tail of the singly linked list of registered benchmarks.
+Benchmark *Benchmark::Start = nullptr;
+Benchmark *Benchmark::End = nullptr;
+
+// Appends B to the list of benchmarks executed by runBenchmarks().
+void Benchmark::addBenchmark(Benchmark *B) {
+  if (End == nullptr) {
+    Start = B;
+    End = B;
+    return;
+  }
+
+  End->Next = B;
+  End = B;
+}
+
+// Runs every registered benchmark in registration order. Always returns 0.
+int Benchmark::runBenchmarks() {
+  for (Benchmark *B = Start; B != nullptr; B = B->Next)
+    B->Run();
+
+  return 0;
+}
+
+// Repeatedly samples WrapperFunc, growing the number of iterations per sample,
+// until the per-iteration estimate changes by less than Options.Epsilon (after
+// at least Options.MinSamples samples) or a sample/iteration limit is reached.
+// The value returned by WrapperFunc is accumulated as the cycles of one run.
+BenchmarkResult benchmark(const BenchmarkOptions &Options,
+                          uint64_t (*WrapperFunc)()) {
+  BenchmarkResult Result;
+  RuntimeEstimationProgression REP;
+  size_t TotalIterations = 0;
+  size_t Iterations = Options.InitialIterations;
+  if (Iterations < 1)
+    Iterations = 1;
+  size_t Samples = 0;
+  uint64_t BestGuess = 0;
+  for (;;) {
+    uint64_t SampleCycles = 0;
+    for (size_t I = 0; I < Iterations; I++) {
+      const uint64_t Overhead = LIBC_NAMESPACE::overhead();
+      const uint64_t Elapsed = WrapperFunc();
+      // Clamp at zero: an unsigned subtraction would wrap around to a huge
+      // value whenever the measurement is smaller than the timing overhead.
+      SampleCycles += Elapsed > Overhead ? Elapsed - Overhead : 0;
+    }
+
+    Samples++;
+    TotalIterations += Iterations;
+    const double ChangeRatio =
+        REP.ComputeImprovement({Iterations, SampleCycles});
+    BestGuess = REP.CurrentEstimation;
+
+    if (Samples >= Options.MaxSamples || Iterations >= Options.MaxIterations)
+      break;
+    if (Samples >= Options.MinSamples && ChangeRatio < Options.Epsilon)
+      break;
+
+    // Always grow by at least one iteration: truncating the scaled value back
+    // to an integer would otherwise leave small counts (e.g. 1 * 1.4) stuck.
+    const size_t Scaled =
+        static_cast<size_t>(Iterations * Options.ScalingFactor);
+    Iterations = Scaled > Iterations ? Scaled : Iterations + 1;
+  }
+  Result.Cycles = BestGuess;
+  Result.Samples = Samples;
+  Result.TotalIterations = TotalIterations;
+  return Result;
+}
+
+} // namespace libc_gpu_benchmarks
+} // namespace LIBC_NAMESPACE
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
new file mode 100644
index 0000000000000..ccbbe3629dbda
--- /dev/null
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -0,0 +1,147 @@
+#ifndef LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H
+#define LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H
+
+#include "benchmarks/gpu/timing/timing.h"
+
+#include "benchmarks/gpu/BenchmarkLogger.h"
+
+#include <stddef.h>
+#include <stdint.h>
+
+namespace LIBC_NAMESPACE {
+
+namespace libc_gpu_benchmarks {
+
+// Tunable limits for the sampling loop in benchmark().
+struct BenchmarkOptions {
+  uint32_t InitialIterations = 1;
+  uint32_t MaxIterations = 10000000;
+  uint32_t MinSamples = 4;
+  uint32_t MaxSamples = 1000;
+  double Epsilon = 0.01;
+  double ScalingFactor = 1.4;
+};
+
+// One sample: how many iterations were run and the cycles they took in total.
+struct Measurement {
+  size_t Iterations = 0;
+  uint64_t ElapsedCycles = 0;
+};
+
+// Accumulates measurements into a running cycles-per-iteration average.
+class RefinableRuntimeEstimation {
+  uint64_t TotalCycles = 0;
+  size_t TotalIterations = 0;
+
+public:
+  // Adds M to the totals and returns the updated average. Returns 0 while no
+  // iteration has been recorded, instead of dividing by zero.
+  uint64_t Update(const Measurement &M) {
+    TotalCycles += M.ElapsedCycles;
+    TotalIterations += M.Iterations;
+    if (TotalIterations == 0)
+      return 0;
+    return TotalCycles / TotalIterations;
+  }
+};
+
+// Tracks the progression of the runtime estimation
+class RuntimeEstimationProgression {
+  RefinableRuntimeEstimation RRE;
+
+public:
+  uint64_t CurrentEstimation = 0;
+
+  // Folds M into the estimation and returns the absolute relative change
+  // between the previous and the new estimation. Two consecutive zero
+  // estimations count as "no change".
+  double ComputeImprovement(const Measurement &M) {
+    const uint64_t NewEstimation = RRE.Update(M);
+    double Ratio = 0.0;
+    if (NewEstimation != 0) {
+      Ratio = ((double)CurrentEstimation / NewEstimation) - 1.0;
+      // Get absolute value
+      if (Ratio < 0)
+        Ratio *= -1;
+    } else if (CurrentEstimation != 0) {
+      // The estimation dropped to zero: report a full change rather than
+      // dividing by zero.
+      Ratio = 1.0;
+    }
+
+    CurrentEstimation = NewEstimation;
+    return Ratio;
+  }
+};
+
+// Outcome of benchmark(): the final cycles-per-iteration estimate plus the
+// number of samples and iterations that were run to obtain it.
+struct BenchmarkResult {
+  uint64_t Cycles = 0;
+  size_t Samples = 0;
+  size_t TotalIterations = 0;
+};
+
+BenchmarkResult benchmark(const BenchmarkOptions &Options,
+                          uint64_t (*WrapperFunc)());
+
+// Base class of all benchmarks. Derived classes register instances through
+// addBenchmark(); runBenchmarks() then calls Run() on each of them.
+class Benchmark {
+  Benchmark *Next = nullptr;
+
+public:
+  virtual ~Benchmark() {}
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  static int runBenchmarks();
+
+protected:
+  static void addBenchmark(Benchmark *);
+
+private:
+  virtual void Run() = 0;
+  virtual const char *getName() const = 0;
+
+  static Benchmark *Start;
+  static Benchmark *End;
+};
+
+// A benchmark that times a free function returning a cycle count.
+class WrapperBenchmark : public Benchmark {
+  using BenchmarkWrapperFunction = uint64_t (*)();
+  BenchmarkWrapperFunction Func;
+  const char *Name;
+
+public:
+  // Registers the benchmark; Name is stored by pointer and must outlive it.
+  WrapperBenchmark(BenchmarkWrapperFunction Func, char const *Name)
+      : Func(Func), Name(Name) {
+    addBenchmark(this);
+  }
+
+private:
+  void Run() override {
+    constexpr auto GREEN = "\033[32m";
+    constexpr auto RESET = "\033[0m";
+    // Announce the benchmark before running it so the banner shows up even if
+    // the measurement itself takes long or never returns.
+    blog << GREEN << "[ RUN      ] " << RESET << Name << '\n';
+    BenchmarkOptions Options;
+    auto result = benchmark(Options, Func);
+    blog << GREEN << "[       OK ] " << RESET << Name << ": " << result.Cycles
+         << " cycles, " << result.TotalIterations << " iterations\n";
+  }
+  const char *getName() const override { return Name; }
+};
+} // namespace libc_gpu_benchmarks
+} // namespace LIBC_NAMESPACE
+
+// Defines a WrapperBenchmark object named SuiteName_TestName_Instance that
+// registers Func under the name "SuiteName.TestName" when it is constructed.
+#define BENCHMARK(SuiteName, TestName, Func)                                   \
+  LIBC_NAMESPACE::libc_gpu_benchmarks::WrapperBenchmark                        \
+      SuiteName##_##TestName##_Instance(Func, #SuiteName "." #TestName);
+
+#endif // LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp b/libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp
new file mode 100644
index 0000000000000..c971b00cc9a1b
--- /dev/null
+++ b/libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp
@@ -0,0 +1,6 @@
+#include "LibcGpuBenchmark.h"
+
+extern "C" int main(int argc, char **argv, char **envp) {
+  LIBC_NAMESPACE::libc_gpu_benchmarks::Benchmark::runBenchmarks();
+  return 0;
+}
diff --git a/libc/benchmarks/gpu/src/CMakeLists.txt b/libc/benchmarks/gpu/src/CMakeLists.txt
new file mode 100644
index 0000000000000..f15d082e4dd2b
--- /dev/null
+++ b/libc/benchmarks/gpu/src/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_subdirectory(ctype)
+add_subdirectory(math)
diff --git a/libc/benchmarks/gpu/src/ctype/CMakeLists.txt b/libc/benchmarks/gpu/src/ctype/CMakeLists.txt
new file mode 100644
index 0000000000000..ab2f6cdf0c7fd
--- /dev/null
+++ b/libc/benchmarks/gpu/src/ctype/CMakeLists.txt
@@ -0,0 +1,21 @@
+add_custom_target(libc-gpu-ctype-benchmarks)
+
+add_gpu_benchmark(
+    isalnum_benchmark
+    SUITE
+        libc-gpu-ctype-benchmarks
+    SRCS
+        isalnum_benchmark.cpp
+    DEPENDS
+        libc.src.ctype.isalnum
+)
+
+add_gpu_benchmark(
+    isalpha_benchmark
+    SUITE
+        libc-gpu-ctype-benchmarks
+    SRCS
+        isalpha_benchmark.cpp
+    DEPENDS
+        libc.src.ctype.isalpha
+)
diff --git a/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp b/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp
new file mode 100644
index 0000000000000..8d9c958bb7ed4
--- /dev/null
+++ b/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp
@@ -0,0 +1,22 @@
+#include "benchmarks/gpu/LibcGpuBenchmark.h"
+
+#include "src/ctype/isalnum.h"
+
+uint64_t BM_IsAlnum() {
+  char x = 'c';
+  return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalnum, x);
+}
+BENCHMARK(LlvmLibcIsAlNumGpuBenchmark, IsAlnumWrapper, BM_IsAlnum);
+
+[[gnu::noinline]] static uint64_t single_input_function(int x) {
+  asm volatile("" ::"r"(x)); // prevent the compiler from optimizing out x
+  return x;
+}
+
+uint64_t BM_IsAlnumWithOverhead() {
+  char x = 'c';
+  return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalnum, x) -
+         LIBC_NAMESPACE::latency(single_input_function, 0);
+}
+BENCHMARK(LlvmLibcIsAlNumGpuBenchmark, IsAlnumWithOverhead,
+          BM_IsAlnumWithOverhead);
diff --git a/libc/benchmarks/gpu/src/ctype/isalpha_benchmark.cpp b/libc/benchmarks/gpu/src/ctype/isalpha_benchmark.cpp
new file mode 100644
index 0000000000000..2038eb89bc77b
--- /dev/null
+++ b/libc/benchmarks/gpu/src/ctype/isalpha_benchmark.cpp
@@ -0,0 +1,9 @@
+#include "benchmarks/gpu/LibcGpuBenchmark.h"
+
+#include "src/ctype/isalpha.h"
+
+uint64_t BM_IsAlpha() {
+  char x = 'c';
+  return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalpha, x);
+}
+BENCHMARK(LlvmLibcIsAlphaGpuBenchmark, IsAlpha, BM_IsAlpha);
diff --git a/libc/benchmarks/gpu/src/math/CMakeLists.txt b/libc/benchmarks/gpu/src/math/CMakeLists.txt
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/libc/benchmarks/gpu/timing/CMakeLists.txt b/libc/benchmarks/gpu/timing/CMakeLists.txt
new file mode 100644
index 0000000000000..0e6a5a6b47968
--- /dev/null
+++ b/libc/benchmarks/gpu/timing/CMakeLists.txt
@@ -0,0 +1,12 @@
+foreach(target nvptx)
+    add_subdirectory(${target})
+    list(APPEND target_gpu_timing libc.benchmarks.gpu.timing.${target}.${target}_timing)
+endforeach()
+
+add_header_library(
+    timing
+    HDRS
+        timing.h
+    DEPENDS
+        ${target_gpu_timing}
+)
diff --git a/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt b/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt
new file mode 100644
index 0000000000000..9958e16206a41
--- /dev/null
+++ b/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt
@@ -0,0 +1,7 @@
+add_header_library(
+  nvptx_timing
+  HDRS
+    timing.h
+  DEPENDS
+    libc.src.__support.common
+)
diff --git a/libc/benchmarks/gpu/timing/nvptx/timing.h b/libc/benchmarks/gpu/timing/nvptx/timing.h
new file mode 100644
index 0000000000000..008432e6aa1d2
--- /dev/null
+++ b/libc/benchmarks/gpu/timing/nvptx/timing.h
@@ -0,0 +1,108 @@
+//===------------- NVPTX implementation of timing utils ---------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
+#define LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
+
+#include "src/__support/GPU/utils.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/attributes.h"
+#include "src/__support/macros/config.h"
+
+#include <stdint.h>
+
+namespace LIBC_NAMESPACE {
+
+// Returns the overhead associated with calling the profiling region. This
+// allows us to subtract the constant-time overhead from the latency to
+// obtain a true result. This can vary with system load.
+[[gnu::noinline]] static uint64_t overhead() {
+  volatile uint32_t x = 1;
+  uint32_t y = x;
+  gpu::sync_threads();
+  uint64_t start = gpu::processor_clock();
+  asm volatile("" ::"r"(y), "llr"(start));
+  uint32_t result = y;
+  asm volatile("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :);
+  uint64_t stop = gpu::processor_clock();
+  gpu::sync_threads();
+  volatile auto storage = result;
+  return stop - start;
+}
+
+// Stimulate a simple function and obtain its latency in clock cycles on the
+// system. This function cannot be inlined or else it will disturb the very
+// delicate balance of hard-coded dependencies.
+//
+// FIXME: This does not work in general on NVPTX because of further
+// optimizations ptxas performs. The only way to get consistent results is to
+// pass an extra "SHELL:-Xcuda-ptxas -O0" to CMake's compiler flag. This
+// negatively impacts performance but it is at least stable.
+template <typename F, typename T>
+[[gnu::noinline]] static LIBC_INLINE uint64_t latency(F f, T t) {
+  // We need to store the input somewhere to guarantee that the compiler will
+  // not constant propagate it and remove the profiling region.
+  volatile T storage = t;
+  T arg = storage;
+  asm volatile("" ::"r"(arg));
+
+  // Get the current timestamp from the clock.
+  gpu::sync_threads();
+  __nvvm_membar_sys();
+  uint64_t start = gpu::processor_clock();
+
+  // This forces the compiler to load the input argument and run the clock cycle
+  // counter before the profiling region.
+  asm volatile("" ::"r"(arg), "llr"(start));
+
+  // Run the function under test and return its value.
+  auto result = f(arg);
+
+  // This inline assembly performs a no-op which forces the result to both be
+  // used and prevents us from exiting this region before it's complete.
+  asm volatile("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :);
+
+  // Obtain the current timestamp after running the calculation and force
+  // ordering.
+  uint64_t stop = gpu::processor_clock();
+  __nvvm_membar_sys();
+  gpu::sync_threads();
+  asm volatile("" ::"r"(stop));
+  volatile T output = result;
+
+  // Return the time elapsed.
+  return stop - start;
+}
+
+template <typename F, typename T1, typename T2>
+static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {
+  volatile T1 storage = t1;
+  volatile T2 storage2 = t2;
+  T1 arg = storage;
+  T2 arg2 = storage2;
+  asm volatile("" ::"r"(arg), "r"(arg2));
+
+  gpu::sync_threads();
+  uint64_t start = gpu::processor_clock();
+
+  asm volatile("" ::"r"(arg), "r"(arg2), "llr"(start));
+
+  auto result = f(arg, arg2);
+
+  asm volatile("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :);
+
+  uint64_t stop = gpu::processor_clock();
+  gpu::sync_threads();
+  asm volatile("" ::"r"(stop));
+  volatile auto output = result;
+
+  return stop - start;
+}
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
diff --git a/libc/benchmarks/gpu/timing/timing.h b/libc/benchmarks/gpu/timing/timing.h
new file mode 100644
index 0000000000000..c47bb0d9ebb55
--- /dev/null
+++ b/libc/benchmarks/gpu/timing/timing.h
@@ -0,0 +1,22 @@
+//===------------- Implementation of GPU timing utils -----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_UTILS_GPU_TIMING_H
+#define LLVM_LIBC_UTILS_GPU_TIMING_H
+
+#include "src/__support/macros/properties/architectures.h"
+
+#if defined(LIBC_TARGET_ARCH_IS_AMDGPU)
+#error "amdgpu not yet supported
+#elif defined(LIBC_TARGET_ARCH_IS_NVPTX)
+#include "nvptx/timing.h"
+#else
+#error "unsupported platform"
+#endif
+
+#endif // LLVM_LIBC_UTILS_GPU_TIMING_H

>From f8291e91be692061ab3d240d78a2112b89cbc342 Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Wed, 15 May 2024 12:47:40 -0400
Subject: [PATCH 02/18] refactor cmake rules

---
 libc/benchmarks/CMakeLists.txt               | 397 +++++++++----------
 libc/benchmarks/gpu/BenchmarkLogger.cpp      |  24 +-
 libc/benchmarks/gpu/CMakeLists.txt           | 162 +-------
 libc/benchmarks/gpu/src/ctype/CMakeLists.txt |  32 +-
 libc/benchmarks/gpu/timing/CMakeLists.txt    |  14 +-
 libc/benchmarks/gpu/timing/timing.h          |   2 +-
 libc/cmake/modules/LLVMLibCTestRules.cmake   |  10 +-
 7 files changed, 259 insertions(+), 382 deletions(-)

diff --git a/libc/benchmarks/CMakeLists.txt b/libc/benchmarks/CMakeLists.txt
index a802e653a091e..8b51511e3b5cf 100644
--- a/libc/benchmarks/CMakeLists.txt
+++ b/libc/benchmarks/CMakeLists.txt
@@ -1,211 +1,210 @@
-if(NOT LIBC_TARGET_OS_IS_GPU)
-	find_package(Threads)
-
-	set(LLVM_LINK_COMPONENTS
-	  Support
-	  TargetParser
-	  )
-
-	#==============================================================================
-	# Add Unit Testing Support
-	#==============================================================================
-
-	function(add_libc_benchmark_unittest target_name)
-	  if(NOT LLVM_INCLUDE_TESTS)
-	    return()
-	  endif()
-
-	  cmake_parse_arguments(
-	    "LIBC_BENCHMARKS_UNITTEST"
-	    "" # No optional arguments
-	    "SUITE" # Single value arguments
-	    "SRCS;DEPENDS" # Multi-value arguments
-	    ${ARGN}
-	  )
-
-	  add_executable(${target_name}
-	    EXCLUDE_FROM_ALL
-	    ${LIBC_BENCHMARKS_UNITTEST_SRCS}
-	  )
-	  target_link_libraries(${target_name}
-	    PRIVATE
-	    llvm_gtest_main
-	    llvm_gtest
-	    ${LIBC_BENCHMARKS_UNITTEST_DEPENDS}
-	  )
-	  llvm_update_compile_flags(${target_name})
-
-	  add_custom_command(
-	    TARGET ${target_name}
-	    POST_BUILD
-	    COMMAND $<TARGET_FILE:${target_name}>
-	  )
-	  add_dependencies(libc-benchmark-util-tests ${target_name})
-	endfunction()
-
-	#==============================================================================
-	# Build Google Benchmark for libc
-	#==============================================================================
-
-	include(ExternalProject)
-	ExternalProject_Add(google-benchmark-libc
-		EXCLUDE_FROM_ALL ON
-		PREFIX google-benchmark-libc
-		SOURCE_DIR ${LLVM_THIRD_PARTY_DIR}/benchmark
-		INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/google-benchmark-libc
-		CMAKE_CACHE_ARGS
-		  -DBENCHMARK_ENABLE_EXCEPTIONS:BOOL=OFF
-		  -DBENCHMARK_ENABLE_LTO:BOOL=OFF
-		  -DBENCHMARK_ENABLE_TESTING:BOOL=OFF
-		  -DBENCHMARK_ENABLE_WERROR:BOOL=${LLVM_ENABLE_WERROR}
-		  -DBENCHMARK_FORCE_WERROR:BOOL=OFF
-		  -DBENCHMARK_USE_LIBCXX:BOOL=OFF
-		  -DCMAKE_BUILD_TYPE:STRING=Release
-
-		  -DCMAKE_SYSTEM_NAME:STRING=${CMAKE_SYSTEM_NAME}
-		  -DCMAKE_SYSTEM_PROCESSOR:STRING=${CMAKE_SYSTEM_PROCESSOR}
-		  -DCMAKE_C_COMPILER:STRING=${CMAKE_C_COMPILER}
-		  -DCMAKE_CXX_COMPILER:STRING=${CMAKE_CXX_COMPILER}
-		  -DCMAKE_CXX_FLAGS:STRING=${CMAKE_CXX_FLAGS}
-		  -DCMAKE_FIND_ROOT_PATH:STRING=${CMAKE_FIND_ROOT_PATH}
-
-		  -DBUILD_SHARED_LIBS:BOOL=OFF
-		  -DCMAKE_EXE_LINKER_FLAGS:STRING=-static
-
-		  -DCMAKE_CXX_STANDARD:STRING=14
-		  -DCMAKE_INSTALL_PREFIX:PATH=<INSTALL_DIR>
-		)
-
-	add_custom_target(libc-benchmark-util-tests)
-
-	# libc-benchmark
-	add_library(libc-benchmark
-	    STATIC
-	    EXCLUDE_FROM_ALL
-	    LibcBenchmark.cpp
-	    LibcBenchmark.h
-	)
+if(LIBC_TARGET_OS_IS_GPU)
+	add_subdirectory(gpu)
+	return()
+endif()
 
-	target_include_directories(libc-benchmark
-	    PUBLIC ${LLVM_INCLUDE_DIR} ${LLVM_MAIN_INCLUDE_DIR}
-	)
-	target_link_libraries(libc-benchmark
-	    PUBLIC
-	    benchmark::benchmark
-            LLVMSupport
-	    LLVMTargetParser
-	    Threads::Threads
-	)
-	add_dependencies(libc-benchmark google-benchmark-libc)
-	llvm_update_compile_flags(libc-benchmark)
+find_package(Threads)
 
-	add_libc_benchmark_unittest(libc-benchmark-test
-	    SRCS LibcBenchmarkTest.cpp
-	    DEPENDS libc-benchmark
+set(LLVM_LINK_COMPONENTS
+	Support
+	TargetParser
 	)
 
-	# libc-memory-benchmark
-	add_library(libc-memory-benchmark
-	    STATIC
-	    EXCLUDE_FROM_ALL
-	    LibcMemoryBenchmark.cpp
-	    LibcMemoryBenchmark.h
-	    LibcFunctionPrototypes.h
-	    MemorySizeDistributions.cpp
-	    MemorySizeDistributions.h
-	)
-	target_include_directories(libc-memory-benchmark
-	    PUBLIC
-	    ${CMAKE_CURRENT_SOURCE_DIR}
+#==============================================================================
+# Add Unit Testing Support
+#==============================================================================
+
+function(add_libc_benchmark_unittest target_name)
+	if(NOT LLVM_INCLUDE_TESTS)
+	return()
+	endif()
+
+	cmake_parse_arguments(
+	"LIBC_BENCHMARKS_UNITTEST"
+	"" # No optional arguments
+	"SUITE" # Single value arguments
+	"SRCS;DEPENDS" # Multi-value arguments
+	${ARGN}
 	)
-	target_link_libraries(libc-memory-benchmark
-	    PUBLIC
-	    libc-benchmark
-	)
-	llvm_update_compile_flags(libc-memory-benchmark)
 
-	add_libc_benchmark_unittest(libc-memory-benchmark-test
-	    SRCS LibcMemoryBenchmarkTest.cpp
-	    DEPENDS libc-memory-benchmark
+	add_executable(${target_name}
+	EXCLUDE_FROM_ALL
+	${LIBC_BENCHMARKS_UNITTEST_SRCS}
 	)
-
-	# json
-	add_library(json
-	    STATIC
-	    EXCLUDE_FROM_ALL
-	    JSON.cpp
-	    JSON.h
+	target_link_libraries(${target_name}
+	PRIVATE
+	llvm_gtest_main
+	llvm_gtest
+	${LIBC_BENCHMARKS_UNITTEST_DEPENDS}
 	)
-	target_link_libraries(json PUBLIC libc-memory-benchmark)
-	llvm_update_compile_flags(json)
+	llvm_update_compile_flags(${target_name})
 
-	add_libc_benchmark_unittest(json-test
-	    SRCS JSONTest.cpp
-	    DEPENDS json
+	add_custom_command(
+	TARGET ${target_name}
+	POST_BUILD
+	COMMAND $<TARGET_FILE:${target_name}>
 	)
-
-	#==============================================================================
-	# Benchmarking tool
-	#==============================================================================
-
-	# Benchmark all implementations that can run on the target CPU.
-	function(add_libc_multi_impl_benchmark name)
-	  get_property(fq_implementations GLOBAL PROPERTY ${name}_implementations)
-	  foreach(fq_config_name IN LISTS fq_implementations)
-	    get_target_property(required_cpu_features ${fq_config_name} REQUIRE_CPU_FEATURES)
-	    cpu_supports(can_run "${required_cpu_features}")
-	    if(can_run)
-		set(benchmark_name ${fq_config_name}_benchmark)
-		add_executable(${benchmark_name}
-		    EXCLUDE_FROM_ALL
-		    LibcMemoryBenchmarkMain.cpp
-		)
-		get_target_property(entrypoint_object_file ${fq_config_name} "OBJECT_FILE_RAW")
-		target_link_libraries(${benchmark_name} PUBLIC json ${entrypoint_object_file})
-		string(TOUPPER ${name} name_upper)
-		target_compile_definitions(${benchmark_name} PRIVATE "-DLIBC_BENCHMARK_FUNCTION_${name_upper}=LIBC_NAMESPACE::${name}" "-DLIBC_BENCHMARK_FUNCTION_NAME=\"${fq_config_name}\"")
-		llvm_update_compile_flags(${benchmark_name})
-	    else()
-	      message(STATUS "Skipping benchmark for '${fq_config_name}' insufficient host cpu features '${required_cpu_features}'")
-	    endif()
-	  endforeach()
-	endfunction()
-
-	add_libc_multi_impl_benchmark(bcmp)
-	add_libc_multi_impl_benchmark(bzero)
-	add_libc_multi_impl_benchmark(memcmp)
-	add_libc_multi_impl_benchmark(memcpy)
-	add_libc_multi_impl_benchmark(memmove)
-	add_libc_multi_impl_benchmark(memset)
-
-	#==============================================================================
-	# Google Benchmarking tool
-	#==============================================================================
-
-	# This target uses the Google Benchmark facility to report throughput for llvm
-	# libc memory functions compiled for the host machine. This is useful to
-	# continuously monitor the performance of the memory functions.
-	add_executable(libc.benchmarks.memory_functions.opt_host
-	  EXCLUDE_FROM_ALL
-	  LibcMemoryGoogleBenchmarkMain.cpp
-	  LibcDefaultImplementations.cpp
-	)
-	target_link_libraries(libc.benchmarks.memory_functions.opt_host
-	  PRIVATE
-	  libc-memory-benchmark
-	  libc.src.string.memcmp_opt_host.__internal__
-	  libc.src.string.bcmp_opt_host.__internal__
-	  libc.src.string.memcpy_opt_host.__internal__
-	  libc.src.string.memset_opt_host.__internal__
-	  libc.src.string.bzero_opt_host.__internal__
-	  libc.src.string.memmove_opt_host.__internal__
-	  benchmark_main
+	add_dependencies(libc-benchmark-util-tests ${target_name})
+endfunction()
+
+#==============================================================================
+# Build Google Benchmark for libc
+#==============================================================================
+
+include(ExternalProject)
+ExternalProject_Add(google-benchmark-libc
+	EXCLUDE_FROM_ALL ON
+	PREFIX google-benchmark-libc
+	SOURCE_DIR ${LLVM_THIRD_PARTY_DIR}/benchmark
+	INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/google-benchmark-libc
+	CMAKE_CACHE_ARGS
+		-DBENCHMARK_ENABLE_EXCEPTIONS:BOOL=OFF
+		-DBENCHMARK_ENABLE_LTO:BOOL=OFF
+		-DBENCHMARK_ENABLE_TESTING:BOOL=OFF
+		-DBENCHMARK_ENABLE_WERROR:BOOL=${LLVM_ENABLE_WERROR}
+		-DBENCHMARK_FORCE_WERROR:BOOL=OFF
+		-DBENCHMARK_USE_LIBCXX:BOOL=OFF
+		-DCMAKE_BUILD_TYPE:STRING=Release
+
+		-DCMAKE_SYSTEM_NAME:STRING=${CMAKE_SYSTEM_NAME}
+		-DCMAKE_SYSTEM_PROCESSOR:STRING=${CMAKE_SYSTEM_PROCESSOR}
+		-DCMAKE_C_COMPILER:STRING=${CMAKE_C_COMPILER}
+		-DCMAKE_CXX_COMPILER:STRING=${CMAKE_CXX_COMPILER}
+		-DCMAKE_CXX_FLAGS:STRING=${CMAKE_CXX_FLAGS}
+		-DCMAKE_FIND_ROOT_PATH:STRING=${CMAKE_FIND_ROOT_PATH}
+
+		-DBUILD_SHARED_LIBS:BOOL=OFF
+		-DCMAKE_EXE_LINKER_FLAGS:STRING=-static
+
+		-DCMAKE_CXX_STANDARD:STRING=14
+		-DCMAKE_INSTALL_PREFIX:PATH=<INSTALL_DIR>
 	)
-	llvm_update_compile_flags(libc.benchmarks.memory_functions.opt_host)
-
-	add_subdirectory(automemcpy)
-endif()
 
-if(LIBC_TARGET_OS_IS_GPU)
-	add_subdirectory(gpu)
-endif()
+add_custom_target(libc-benchmark-util-tests)
+
+# libc-benchmark
+add_library(libc-benchmark
+	STATIC
+	EXCLUDE_FROM_ALL
+	LibcBenchmark.cpp
+	LibcBenchmark.h
+)
+
+target_include_directories(libc-benchmark
+	PUBLIC ${LLVM_INCLUDE_DIR} ${LLVM_MAIN_INCLUDE_DIR}
+)
+target_link_libraries(libc-benchmark
+	PUBLIC
+	benchmark::benchmark
+		LLVMSupport
+	LLVMTargetParser
+	Threads::Threads
+)
+add_dependencies(libc-benchmark google-benchmark-libc)
+llvm_update_compile_flags(libc-benchmark)
+
+add_libc_benchmark_unittest(libc-benchmark-test
+	SRCS LibcBenchmarkTest.cpp
+	DEPENDS libc-benchmark
+)
+
+# libc-memory-benchmark
+add_library(libc-memory-benchmark
+	STATIC
+	EXCLUDE_FROM_ALL
+	LibcMemoryBenchmark.cpp
+	LibcMemoryBenchmark.h
+	LibcFunctionPrototypes.h
+	MemorySizeDistributions.cpp
+	MemorySizeDistributions.h
+)
+target_include_directories(libc-memory-benchmark
+	PUBLIC
+	${CMAKE_CURRENT_SOURCE_DIR}
+)
+target_link_libraries(libc-memory-benchmark
+	PUBLIC
+	libc-benchmark
+)
+llvm_update_compile_flags(libc-memory-benchmark)
+
+add_libc_benchmark_unittest(libc-memory-benchmark-test
+	SRCS LibcMemoryBenchmarkTest.cpp
+	DEPENDS libc-memory-benchmark
+)
+
+# json
+add_library(json
+	STATIC
+	EXCLUDE_FROM_ALL
+	JSON.cpp
+	JSON.h
+)
+target_link_libraries(json PUBLIC libc-memory-benchmark)
+llvm_update_compile_flags(json)
+
+add_libc_benchmark_unittest(json-test
+	SRCS JSONTest.cpp
+	DEPENDS json
+)
+
+#==============================================================================
+# Benchmarking tool
+#==============================================================================
+
+# Benchmark all implementations that can run on the target CPU.
+function(add_libc_multi_impl_benchmark name)
+	get_property(fq_implementations GLOBAL PROPERTY ${name}_implementations)
+	foreach(fq_config_name IN LISTS fq_implementations)
+	get_target_property(required_cpu_features ${fq_config_name} REQUIRE_CPU_FEATURES)
+	cpu_supports(can_run "${required_cpu_features}")
+	if(can_run)
+	set(benchmark_name ${fq_config_name}_benchmark)
+	add_executable(${benchmark_name}
+		EXCLUDE_FROM_ALL
+		LibcMemoryBenchmarkMain.cpp
+	)
+	get_target_property(entrypoint_object_file ${fq_config_name} "OBJECT_FILE_RAW")
+	target_link_libraries(${benchmark_name} PUBLIC json ${entrypoint_object_file})
+	string(TOUPPER ${name} name_upper)
+	target_compile_definitions(${benchmark_name} PRIVATE "-DLIBC_BENCHMARK_FUNCTION_${name_upper}=LIBC_NAMESPACE::${name}" "-DLIBC_BENCHMARK_FUNCTION_NAME=\"${fq_config_name}\"")
+	llvm_update_compile_flags(${benchmark_name})
+	else()
+		message(STATUS "Skipping benchmark for '${fq_config_name}' insufficient host cpu features '${required_cpu_features}'")
+	endif()
+	endforeach()
+endfunction()
+
+add_libc_multi_impl_benchmark(bcmp)
+add_libc_multi_impl_benchmark(bzero)
+add_libc_multi_impl_benchmark(memcmp)
+add_libc_multi_impl_benchmark(memcpy)
+add_libc_multi_impl_benchmark(memmove)
+add_libc_multi_impl_benchmark(memset)
+
+#==============================================================================
+# Google Benchmarking tool
+#==============================================================================
+
+# This target uses the Google Benchmark facility to report throughput for llvm
+# libc memory functions compiled for the host machine. This is useful to
+# continuously monitor the performance of the memory functions.
+add_executable(libc.benchmarks.memory_functions.opt_host
+	EXCLUDE_FROM_ALL
+	LibcMemoryGoogleBenchmarkMain.cpp
+	LibcDefaultImplementations.cpp
+)
+target_link_libraries(libc.benchmarks.memory_functions.opt_host
+	PRIVATE
+	libc-memory-benchmark
+	libc.src.string.memcmp_opt_host.__internal__
+	libc.src.string.bcmp_opt_host.__internal__
+	libc.src.string.memcpy_opt_host.__internal__
+	libc.src.string.memset_opt_host.__internal__
+	libc.src.string.bzero_opt_host.__internal__
+	libc.src.string.memmove_opt_host.__internal__
+	benchmark_main
+)
+llvm_update_compile_flags(libc.benchmarks.memory_functions.opt_host)
+
+add_subdirectory(automemcpy)
diff --git a/libc/benchmarks/gpu/BenchmarkLogger.cpp b/libc/benchmarks/gpu/BenchmarkLogger.cpp
index 94a0d897c9585..4f70d23a1e95e 100644
--- a/libc/benchmarks/gpu/BenchmarkLogger.cpp
+++ b/libc/benchmarks/gpu/BenchmarkLogger.cpp
@@ -13,18 +13,21 @@ namespace libc_gpu_benchmarks {
 
 // cpp::string_view specialization
 template <>
-BenchmarkLogger &BenchmarkLogger::operator<< <cpp::string_view>(cpp::string_view str) {
+BenchmarkLogger &
+    BenchmarkLogger::operator<< <cpp::string_view>(cpp::string_view str) {
   LIBC_NAMESPACE::write_to_stderr(str);
   return *this;
 }
 
 // cpp::string specialization
-template <> BenchmarkLogger &BenchmarkLogger::operator<< <cpp::string>(cpp::string str) {
+template <>
+BenchmarkLogger &BenchmarkLogger::operator<< <cpp::string>(cpp::string str) {
   return *this << static_cast<cpp::string_view>(str);
 }
 
 // const char* specialization
-template <> BenchmarkLogger &BenchmarkLogger::operator<< <const char *>(const char *str) {
+template <>
+BenchmarkLogger &BenchmarkLogger::operator<< <const char *>(const char *str) {
   return *this << cpp::string_view(str);
 }
 
@@ -66,15 +69,20 @@ template BenchmarkLogger &BenchmarkLogger::operator<< <short>(short);
 template BenchmarkLogger &BenchmarkLogger::operator<< <int>(int);
 template BenchmarkLogger &BenchmarkLogger::operator<< <long>(long);
 template BenchmarkLogger &BenchmarkLogger::operator<< <long long>(long long);
-template BenchmarkLogger &BenchmarkLogger::operator<< <unsigned char>(unsigned char);
-template BenchmarkLogger &BenchmarkLogger::operator<< <unsigned short>(unsigned short);
-template BenchmarkLogger &BenchmarkLogger::operator<< <unsigned int>(unsigned int);
-template BenchmarkLogger &BenchmarkLogger::operator<< <unsigned long>(unsigned long);
+template BenchmarkLogger &
+    BenchmarkLogger::operator<< <unsigned char>(unsigned char);
+template BenchmarkLogger &
+    BenchmarkLogger::operator<< <unsigned short>(unsigned short);
+template BenchmarkLogger &
+    BenchmarkLogger::operator<< <unsigned int>(unsigned int);
+template BenchmarkLogger &
+    BenchmarkLogger::operator<< <unsigned long>(unsigned long);
 template BenchmarkLogger &
     BenchmarkLogger::operator<< <unsigned long long>(unsigned long long);
 
 #ifdef LIBC_TYPES_HAS_INT128
-template BenchmarkLogger &BenchmarkLogger::operator<< <__uint128_t>(__uint128_t);
+template BenchmarkLogger &
+    BenchmarkLogger::operator<< <__uint128_t>(__uint128_t);
 #endif // LIBC_TYPES_HAS_INT128
 template BenchmarkLogger &BenchmarkLogger::operator<< <UInt<128>>(UInt<128>);
 template BenchmarkLogger &BenchmarkLogger::operator<< <UInt<192>>(UInt<192>);
diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
index a18be27e33573..5dafe66bbd738 100644
--- a/libc/benchmarks/gpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/CMakeLists.txt
@@ -2,161 +2,25 @@ add_subdirectory(timing)
 
 add_custom_target(gpu-benchmark)
 
-function (add_gpu_benchmark test_name)
-  if(NOT TARGET libc.startup.${LIBC_TARGET_OS}.crt1)
-    message(VERBOSE "Skipping ${fq_target_name} as it is not available on ${LIBC_TARGET_OS}.")
-    return()
-  endif()
-
+function(add_benchmark benchmark_name)
   cmake_parse_arguments(
-    "GPU_BENCHMARK"
-    "" # No optional arguments
-    "SUITE" # Single value arguments
-    "SRCS;HDRS;DEPENDS;ARGS;ENV;COMPILE_OPTIONS;LINK_LIBRARIES;LOADER_ARGS" # Multi-value arguments
+    "BENCHMARK"
+    "" # No optional (boolean flag) arguments
+    "" # No single value arguments
+    "LINK_LIBRARIES" # Multi-value arguments: extra libraries, kept as a list
     ${ARGN}
   )
-
-  if(NOT GPU_BENCHMARK_SUITE)
-    message(FATAL_ERROR "SUITE not specified for ${fq_target_name}")
-  endif()
-  if(NOT GPU_BENCHMARK_SRCS)
-    message(FATAL_ERROR "The SRCS list for add_gpu_benchmark is missing.")
-  endif()
-
-  get_fq_target_name(${test_name} fq_target_name)
-  get_fq_target_name(${test_name}.libc fq_libc_target_name) # Stores the compiled libc + infrastructure archive to link in
-  get_fq_deps_list(fq_deps_list ${GPU_BENCHMARK_DEPENDS})
-  list(APPEND fq_deps_list
-      # Hermetic tests use the platform's startup object. So, their deps also
-      # have to be collected.
-      libc.startup.${LIBC_TARGET_OS}.crt1
-      # We always add the memory functions objects. This is because the
-      # compiler's codegen can emit calls to the C memory functions.
-      libc.src.string.bcmp
-      libc.src.string.bzero
-      libc.src.string.memcmp
-      libc.src.string.memcpy
-      libc.src.string.memmove
-      libc.src.string.memset
-      libc.src.__support.StringUtil.error_to_string
-  )
-
-  list(REMOVE_DUPLICATES fq_deps_list)
-
-  # TODO: Instead of gathering internal object files from entrypoints,
-  # collect the object files with public names of entrypoints.
-  get_object_files_for_test(
-      link_object_files skipped_entrypoints_list ${fq_deps_list})
-  if(skipped_entrypoints_list)
-    if(LIBC_CMAKE_VERBOSE_LOGGING)
-      set(msg "Skipping hermetic test ${fq_target_name} as it has missing deps: "
-              "${skipped_entrypoints_list}.")
-    endif()
-    return()
-  endif()
-  list(REMOVE_DUPLICATES link_object_files)
-
-  # Make a library of all deps
-  add_library(
-    ${fq_target_name}.__libc__
-    STATIC
-    EXCLUDE_FROM_ALL
-    ${link_object_files}
-  )
-  set_target_properties(${fq_target_name}.__libc__
-      PROPERTIES ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-  set_target_properties(${fq_target_name}.__libc__
-      PROPERTIES ARCHIVE_OUTPUT_NAME ${fq_target_name}.libc)
-
-  set(fq_build_target_name ${fq_target_name}.__build__)
-  add_executable(
-    ${fq_build_target_name}
-    EXCLUDE_FROM_ALL
-    $<$<BOOL:${LIBC_TARGET_ARCHITECTURE_IS_NVPTX}>:${link_object_files}>
-    ${GPU_BENCHMARK_SRCS}
-    ${GPU_BENCHMARK_HDRS}
-  )
-  set_target_properties(${fq_build_target_name}
-    PROPERTIES
-      RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
-  )
-
-  _get_hermetic_test_compile_options(compile_options "${GPU_BENCHMARK_COMPILE_OPTIONS}")
-  target_include_directories(${fq_build_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR})
-  target_include_directories(${fq_build_target_name} PRIVATE ${LIBC_SOURCE_DIR})
-  _get_hermetic_test_compile_options(compile_options "${GPU_BENCHMARK_COMPILE_OPTIONS}")
-  target_compile_options(${fq_build_target_name} PRIVATE ${compile_options}) 
-
-  set(link_libraries "")
-  foreach(lib in LISTS GPU_BENCHMARK_LINK_LIBRARIES)
-    if(TARGET ${lib}.hermetic)
-      list(APPEND link_libraries ${lib}.hermetic)
-    else()
-      list(APPEND link_libraries ${lib})
-    endif()
-  endforeach()
-
-  if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
-    target_link_options(${fq_build_target_name} PRIVATE
-      ${LIBC_COMPILE_OPTIONS_DEFAULT}
-      -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto -Wno-multi-gpu
-      "-Wl,-mllvm,-amdgpu-lower-global-ctor-dtor=0" -nostdlib -static
-      "-Wl,-mllvm,-amdhsa-code-object-version=${LIBC_GPU_CODE_OBJECT_VERSION}")
-  elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
-    # We need to use the internal object versions for NVPTX.
-    set(internal_suffix ".__internal__")
-    target_link_options(${fq_build_target_name} PRIVATE
-      ${LIBC_COMPILE_OPTIONS_DEFAULT} -Wno-multi-gpu
-      "-Wl,--suppress-stack-size-warning"
-      -march=${LIBC_GPU_TARGET_ARCHITECTURE} -nostdlib -static
-      "--cuda-path=${LIBC_CUDA_ROOT}")
-  elseif(LIBC_CC_SUPPORTS_NOSTDLIBPP)
-    target_link_options(${fq_build_target_name} PRIVATE -nolibc -nostartfiles -nostdlib++ -static)
-  else()
-    # Older version of gcc does not support `nostdlib++` flag.  We use
-    # `nostdlib` and link against libgcc_s, which cannot be linked statically.
-    target_link_options(${fq_build_target_name} PRIVATE -nolibc -nostartfiles -nostdlib)
-    list(APPEND link_libraries ${LIBGCC_S_LOCATION})
-  endif()
-
-  # link libraries for the BUILD target (i.e. to compile the test)
-  target_link_libraries(
-    ${fq_build_target_name}
-    PRIVATE
-      libc.startup.${LIBC_TARGET_OS}.crt1${internal_suffix}
-      ${link_libraries}
-      # LibcTest.hermetic
+  add_libc_hermetic_test(
+    ${benchmark_name}
+    IS_BENCHMARK
+    LINK_LIBRARIES
       LibcGpuBenchmark.hermetic
-      # LibcHermeticTestSupport.hermetic
-      LibcHermeticTestSupport.hermetic
-      # The NVIDIA 'nvlink' linker does not currently support static libraries.
-      $<$<NOT:$<BOOL:${LIBC_TARGET_ARCHITECTURE_IS_NVPTX}>>:${fq_target_name}.__libc__>)
-
-  add_dependencies(${fq_build_target_name}
-    LibcGpuBenchmark.hermetic
-  ${fq_deps_list})
-
-  # Tests on the GPU require an external loader utility to launch the kernel.
-  if(TARGET libc.utils.gpu.loader)
-    add_dependencies(${fq_build_target_name} libc.utils.gpu.loader)
-    get_target_property(gpu_loader_exe libc.utils.gpu.loader "EXECUTABLE")
-  endif()
-
-  set(test_cmd ${GPU_BENCHMARK_ENV}
-      $<$<BOOL:${LIBC_TARGET_OS_IS_GPU}>:${gpu_loader_exe}> ${CMAKE_CROSSCOMPILING_EMULATOR} ${GPU_BENCHMARK_LOADER_ARGS}
-      $<TARGET_FILE:${fq_build_target_name}> ${GPU_BENCHMARK_ARGS})
-  add_custom_target(
-    ${fq_target_name}
-    COMMAND ${test_cmd}
-    COMMAND_EXPAND_LISTS
-    COMMENT "Running GPU benchmark ${fq_target_name}"
+      ${BENCHMARK_LINK_LIBRARIES}
+    ${BENCHMARK_UNPARSED_ARGUMENTS}
   )
-
-  # Make this benchmark part of its suite
-  add_dependencies(${GPU_BENCHMARK_SUITE} ${fq_target_name})
-  # Remember to make this benchmark part of the umbrella command
+  get_fq_target_name(${benchmark_name} fq_target_name)
   add_dependencies(gpu-benchmark ${fq_target_name})
-endfunction(add_gpu_benchmark)
+endfunction(add_benchmark)
 
 add_unittest_framework_library(
   LibcGpuBenchmark
diff --git a/libc/benchmarks/gpu/src/ctype/CMakeLists.txt b/libc/benchmarks/gpu/src/ctype/CMakeLists.txt
index ab2f6cdf0c7fd..8d448b8ced955 100644
--- a/libc/benchmarks/gpu/src/ctype/CMakeLists.txt
+++ b/libc/benchmarks/gpu/src/ctype/CMakeLists.txt
@@ -1,21 +1,21 @@
 add_custom_target(libc-gpu-ctype-benchmarks)
 
-add_gpu_benchmark(
-    isalnum_benchmark
-    SUITE
-        libc-gpu-ctype-benchmarks
-    SRCS
-        isalnum_benchmark.cpp
-    DEPENDS
-        libc.src.ctype.isalnum
+add_benchmark(
+	isalnum_benchmark
+	SUITE
+		libc-gpu-ctype-benchmarks
+	SRCS
+		isalnum_benchmark.cpp
+	DEPENDS
+		libc.src.ctype.isalnum
 )
 
-add_gpu_benchmark(
-    isalpha_benchmark
-    SUITE
-        libc-gpu-ctype-benchmarks
-    SRCS
-        isalpha_benchmark.cpp
-    DEPENDS
-        libc.src.ctype.isalpha
+add_benchmark(
+	isalpha_benchmark
+	SUITE
+		libc-gpu-ctype-benchmarks
+	SRCS
+		isalpha_benchmark.cpp
+	DEPENDS
+		libc.src.ctype.isalpha
 )
diff --git a/libc/benchmarks/gpu/timing/CMakeLists.txt b/libc/benchmarks/gpu/timing/CMakeLists.txt
index 0e6a5a6b47968..8bbc7e33f122a 100644
--- a/libc/benchmarks/gpu/timing/CMakeLists.txt
+++ b/libc/benchmarks/gpu/timing/CMakeLists.txt
@@ -1,12 +1,12 @@
 foreach(target nvptx)
-    add_subdirectory(${target})
-    list(APPEND target_gpu_timing libc.benchmarks.gpu.timing.${target}.${target}_timing)
+  add_subdirectory(${target})
+  list(APPEND target_gpu_timing libc.benchmarks.gpu.timing.${target}.${target}_timing)
 endforeach()
 
 add_header_library(
-    timing
-    HDRS
-        timing.h
-    DEPENDS
-        ${target_gpu_timing}
+  timing
+  HDRS
+    timing.h
+  DEPENDS
+    ${target_gpu_timing}
 )
diff --git a/libc/benchmarks/gpu/timing/timing.h b/libc/benchmarks/gpu/timing/timing.h
index c47bb0d9ebb55..180ea77954ae5 100644
--- a/libc/benchmarks/gpu/timing/timing.h
+++ b/libc/benchmarks/gpu/timing/timing.h
@@ -12,7 +12,7 @@
 #include "src/__support/macros/properties/architectures.h"
 
 #if defined(LIBC_TARGET_ARCH_IS_AMDGPU)
-#error "amdgpu not yet supported
+#error "amdgpu not yet supported"
 #elif defined(LIBC_TARGET_ARCH_IS_NVPTX)
 #include "nvptx/timing.h"
 #else
diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake
index c8d7c8a2b1c7c..278137774e089 100644
--- a/libc/cmake/modules/LLVMLibCTestRules.cmake
+++ b/libc/cmake/modules/LLVMLibCTestRules.cmake
@@ -550,7 +550,7 @@ function(add_libc_hermetic_test test_name)
   endif()
   cmake_parse_arguments(
     "HERMETIC_TEST"
-    "" # No optional arguments
+    "IS_BENCHMARK" # Optional arguments
     "SUITE" # Single value arguments
     "SRCS;HDRS;DEPENDS;ARGS;ENV;COMPILE_OPTIONS;LINK_LIBRARIES;LOADER_ARGS" # Multi-value arguments
     ${ARGN}
@@ -651,6 +651,13 @@ function(add_libc_hermetic_test test_name)
     endif()
   endforeach()
 
+  # Benchmarks requires a separate library with a different `main` function
+  if(HERMETIC_TEST_IS_BENCHMARK)
+    list(APPEND link_libraries LibcGpuBenchmark.hermetic)
+  else()
+    list(APPEND link_libraries LibcTest.hermetic)
+  endif()
+
   if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
     target_link_options(${fq_build_target_name} PRIVATE
       ${LIBC_COMPILE_OPTIONS_DEFAULT}
@@ -678,7 +685,6 @@ function(add_libc_hermetic_test test_name)
     PRIVATE
       libc.startup.${LIBC_TARGET_OS}.crt1${internal_suffix}
       ${link_libraries}
-      LibcTest.hermetic
       LibcHermeticTestSupport.hermetic
       # The NVIDIA 'nvlink' linker does not currently support static libraries.
       $<$<NOT:$<BOOL:${LIBC_TARGET_ARCHITECTURE_IS_NVPTX}>>:${fq_target_name}.__libc__>)

>From 1129ccc33651c46ec22c6cd3d679abbb1829b3ba Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Thu, 16 May 2024 13:08:37 -0400
Subject: [PATCH 03/18] fix code style

---
 libc/benchmarks/gpu/CMakeLists.txt           |   1 +
 libc/benchmarks/gpu/LibcGpuBenchmark.cpp     |  87 ++++++++-------
 libc/benchmarks/gpu/LibcGpuBenchmark.h       | 105 +++++++++----------
 libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp |   2 +-
 4 files changed, 97 insertions(+), 98 deletions(-)

diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
index 5dafe66bbd738..db2953f6fcf23 100644
--- a/libc/benchmarks/gpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/CMakeLists.txt
@@ -37,6 +37,7 @@ add_unittest_framework_library(
     libc.src.__support.CPP.string
     libc.src.__support.CPP.string_view
     libc.src.__support.CPP.type_traits
+    libc.src.__support.CPP.functional
     libc.src.__support.fixed_point.fx_rep
     libc.src.__support.macros.properties.types
     libc.src.__support.OSUtil.osutil
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index d37f5a0a53a70..087b59689d90b 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -3,67 +3,66 @@
 namespace LIBC_NAMESPACE {
 namespace libc_gpu_benchmarks {
 
-Benchmark *Benchmark::Start = nullptr;
-Benchmark *Benchmark::End = nullptr;
+Benchmark *Benchmark::start = nullptr;
+Benchmark *Benchmark::end = nullptr;
 
-void Benchmark::addBenchmark(Benchmark *B) {
-  if (End == nullptr) {
-    Start = B;
-    End = B;
+void Benchmark::add_benchmark(Benchmark *benchmark) {
+  if (end == nullptr) {
+    start = benchmark;
+    end = benchmark;
     return;
   }
-
-  End->Next = B;
-  End = B;
+  end->next = benchmark;
+  end = benchmark;
 }
 
-int Benchmark::runBenchmarks() {
-  for (Benchmark *B = Start; B != nullptr; B = B->Next) {
-    B->Run();
-  }
-
+int Benchmark::run_benchmarks() {
+  for (Benchmark *b = start; b != nullptr; b = b->next)
+    b->run();
   return 0;
 }
 
-BenchmarkResult benchmark(const BenchmarkOptions &Options,
-                          uint64_t (*WrapperFunc)()) {
-  BenchmarkResult Result;
-  RuntimeEstimationProgression REP;
-  size_t TotalIterations = 0;
-  size_t Iterations = Options.InitialIterations;
-  if (Iterations < (uint32_t)1) {
-    Iterations = 1;
-  }
-  size_t Samples = 0;
-  uint64_t BestGuess = 0;
-  uint64_t TotalCycles = 0;
+BenchmarkResult benchmark(const BenchmarkOptions &options,
+                          cpp::function<uint64_t(void)> wrapper_func) {
+  BenchmarkResult result;
+  RuntimeEstimationProgression rep;
+  size_t total_iterations = 0;
+  size_t iterations = options.initial_iterations;
+  if (iterations < (uint32_t)1)
+    iterations = 1;
+
+  size_t samples = 0;
+  uint64_t best_guess = 0;
+  uint64_t total_cycles = 0;
   for (;;) {
-    uint64_t SampleCycles = 0;
-    for (uint32_t i = 0; i < Iterations; i++) {
-      auto overhead = LIBC_NAMESPACE::overhead();
-      uint64_t result = WrapperFunc() - overhead;
-      SampleCycles += result;
+    uint64_t sample_cycles = 0;
+    uint64_t overhead = LIBC_NAMESPACE::overhead();
+    for (uint32_t i = 0; i < iterations; i++) {
+      uint64_t result = wrapper_func() - overhead;
+      sample_cycles += result;
     }
 
-    Samples++;
-    TotalCycles += SampleCycles;
-    TotalIterations += Iterations;
-    const double ChangeRatio =
-        REP.ComputeImprovement({Iterations, SampleCycles});
-    BestGuess = REP.CurrentEstimation;
+    samples++;
+    total_cycles += sample_cycles;
+    total_iterations += iterations;
+    const double change_ratio =
+        rep.compute_improvement({iterations, sample_cycles});
+    best_guess = rep.current_estimation;
 
-    if (Samples >= Options.MaxSamples || Iterations >= Options.MaxIterations) {
+    if (samples >= options.max_samples ||
+        iterations >= options.max_iterations) {
       break;
-    } else if (Samples >= Options.MinSamples && ChangeRatio < Options.Epsilon) {
+    } else if (samples >= options.min_samples &&
+               change_ratio < options.epsilon) {
       break;
     }
 
-    Iterations *= Options.ScalingFactor;
+    iterations *= options.scaling_factor;
   }
-  Result.Cycles = BestGuess;
-  Result.Samples = Samples;
-  Result.TotalIterations = TotalIterations;
-  return Result;
+  result.cycles = best_guess;
+  result.samples = samples;
+  result.total_iterations = total_iterations;
+  return result;
 };
 
 } // namespace libc_gpu_benchmarks
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index ccbbe3629dbda..3d762631f2d96 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -1,9 +1,10 @@
 #ifndef LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H
 #define LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H
 
-#include "benchmarks/gpu/timing/timing.h"
-
 #include "benchmarks/gpu/BenchmarkLogger.h"
+#include "benchmarks/gpu/timing/timing.h"
+#include "src/__support/CPP/functional.h"
+#include "src/__support/CPP/string_view.h"
 
 #include <stddef.h>
 #include <stdint.h>
@@ -13,104 +14,102 @@ namespace LIBC_NAMESPACE {
 namespace libc_gpu_benchmarks {
 
 struct BenchmarkOptions {
-  uint32_t InitialIterations = 1;
-  uint32_t MaxIterations = 10000000;
-  uint32_t MinSamples = 4;
-  uint32_t MaxSamples = 1000;
-  double Epsilon = 0.01;
-  double ScalingFactor = 1.4;
+  uint32_t initial_iterations = 1;
+  uint32_t max_iterations = 10000000;
+  uint32_t min_samples = 4;
+  uint32_t max_samples = 1000;
+  double epsilon = 0.01;
+  double scaling_factor = 1.4;
 };
 
 struct Measurement {
-  size_t Iterations = 0;
-  uint64_t ElapsedCycles = 0;
+  size_t iterations = 0;
+  uint64_t elapsed_cycles = 0;
 };
 
 class RefinableRuntimeEstimation {
-  uint64_t TotalCycles = 0;
-  size_t TotalIterations = 0;
+  uint64_t total_cycles = 0;
+  size_t total_iterations = 0;
 
 public:
-  uint64_t Update(const Measurement &M) {
-    TotalCycles += M.ElapsedCycles;
-    TotalIterations += M.Iterations;
-    return TotalCycles / TotalIterations;
+  uint64_t update(const Measurement &M) {
+    total_cycles += M.elapsed_cycles;
+    total_iterations += M.iterations;
+    return total_cycles / total_iterations;
   }
 };
 
 // Tracks the progression of the runtime estimation
 class RuntimeEstimationProgression {
-  RefinableRuntimeEstimation RRE;
+  RefinableRuntimeEstimation rre;
 
 public:
-  uint64_t CurrentEstimation = 0;
+  uint64_t current_estimation = 0;
 
-  double ComputeImprovement(const Measurement &M) {
-    const uint64_t NewEstimation = RRE.Update(M);
-    double Ratio = ((double)CurrentEstimation / NewEstimation) - 1.0;
+  double compute_improvement(const Measurement &M) {
+    const uint64_t new_estimation = rre.update(M);
+    double ratio = ((double)current_estimation / new_estimation) - 1.0;
 
     // Get absolute value
-    if (Ratio < 0) {
-      Ratio *= -1;
-    }
+    if (ratio < 0)
+      ratio *= -1;
 
-    CurrentEstimation = NewEstimation;
-    return Ratio;
+    current_estimation = new_estimation;
+    return ratio;
   }
 };
 
 struct BenchmarkResult {
-  uint64_t Cycles = 0;
-  size_t Samples = 0;
-  size_t TotalIterations = 0;
+  uint64_t cycles = 0;
+  size_t samples = 0;
+  size_t total_iterations = 0;
 };
 
-BenchmarkResult benchmark(const BenchmarkOptions &Options,
-                          uint64_t (*WrapperFunc)());
+BenchmarkResult benchmark(const BenchmarkOptions &options,
+                          cpp::function<uint64_t(void)> wrapper_func);
 
 class Benchmark {
-  Benchmark *Next = nullptr;
+  Benchmark *next = nullptr;
 
 public:
   virtual ~Benchmark() {}
-  virtual void SetUp() {}
-  virtual void TearDown() {}
+  virtual void set_up() {}
+  virtual void tear_down() {}
 
-  static int runBenchmarks();
+  static int run_benchmarks();
 
 protected:
-  static void addBenchmark(Benchmark *);
+  static void add_benchmark(Benchmark *);
 
 private:
-  virtual void Run() = 0;
-  virtual const char *getName() const = 0;
+  virtual void run() = 0;
+  virtual const cpp::string_view get_name() const = 0;
 
-  static Benchmark *Start;
-  static Benchmark *End;
+  static Benchmark *start;
+  static Benchmark *end;
 };
 
 class WrapperBenchmark : public Benchmark {
-  using BenchmarkWrapperFunction = uint64_t (*)();
-  BenchmarkWrapperFunction Func;
-  const char *Name;
+  const cpp::function<uint64_t(void)> func;
+  const cpp::string_view name;
 
 public:
-  WrapperBenchmark(BenchmarkWrapperFunction Func, char const *Name)
-      : Func(Func), Name(Name) {
-    addBenchmark(this);
+  WrapperBenchmark(cpp::function<uint64_t(void)> func, char const *name)
+      : func(func), name(name) {
+    add_benchmark(this);
   }
 
 private:
-  void Run() override {
-    BenchmarkOptions Options;
-    auto result = benchmark(Options, Func);
+  void run() override {
+    BenchmarkOptions options;
+    auto result = benchmark(options, func);
     constexpr auto GREEN = "\033[32m";
     constexpr auto RESET = "\033[0m";
-    blog << GREEN << "[ RUN      ] " << RESET << Name << '\n';
-    blog << GREEN << "[       OK ] " << RESET << Name << ": " << result.Cycles
-         << " cycles, " << result.TotalIterations << " iterations\n";
+    blog << GREEN << "[ RUN      ] " << RESET << name << '\n';
+    blog << GREEN << "[       OK ] " << RESET << name << ": " << result.cycles
+         << " cycles, " << result.total_iterations << " iterations\n";
   }
-  const char *getName() const override { return Name; }
+  const cpp::string_view get_name() const override { return name; }
 };
 } // namespace libc_gpu_benchmarks
 } // namespace LIBC_NAMESPACE
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp b/libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp
index c971b00cc9a1b..510fd13210494 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp
@@ -1,6 +1,6 @@
 #include "LibcGpuBenchmark.h"
 
 extern "C" int main(int argc, char **argv, char **envp) {
-  LIBC_NAMESPACE::libc_gpu_benchmarks::Benchmark::runBenchmarks();
+  LIBC_NAMESPACE::libc_gpu_benchmarks::Benchmark::run_benchmarks();
   return 0;
 }

>From 5c46009bbbddd2114a11fabd2e3afbebed7488f7 Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Fri, 17 May 2024 16:22:23 -0400
Subject: [PATCH 04/18] measure walltime, standard deviation, min, and max

---
 libc/benchmarks/gpu/CMakeLists.txt            |  7 +++++
 libc/benchmarks/gpu/LibcGpuBenchmark.cpp      | 31 ++++++++++++++++---
 libc/benchmarks/gpu/LibcGpuBenchmark.h        | 12 ++++++-
 .../gpu/src/ctype/isalnum_benchmark.cpp       | 13 --------
 libc/benchmarks/gpu/timing/nvptx/timing.h     | 13 +++-----
 5 files changed, 50 insertions(+), 26 deletions(-)

diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
index db2953f6fcf23..b9ca85393cc2e 100644
--- a/libc/benchmarks/gpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/CMakeLists.txt
@@ -10,6 +10,9 @@ function(add_benchmark benchmark_name)
     "" # Multi-value arguments
     ${ARGN}
   )
+  if(NOT libc.src.time.clock IN_LIST TARGET_LLVMLIBC_ENTRYPOINTS)
+    message(FATAL_ERROR "target does not support clock")
+  endif()
   add_libc_hermetic_test(
     ${benchmark_name}
     IS_BENCHMARK
@@ -38,10 +41,14 @@ add_unittest_framework_library(
     libc.src.__support.CPP.string_view
     libc.src.__support.CPP.type_traits
     libc.src.__support.CPP.functional
+    libc.src.__support.CPP.limits
+    libc.src.__support.CPP.algorithm
     libc.src.__support.fixed_point.fx_rep
     libc.src.__support.macros.properties.types
     libc.src.__support.OSUtil.osutil
     libc.src.__support.uint128
+    libc.src.___support.FPUtil.sqrt
+    libc.src.time.clock
     libc.benchmarks.gpu.timing.timing
 )
 
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index 087b59689d90b..3ecff18884b34 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -1,4 +1,7 @@
 #include "LibcGpuBenchmark.h"
+#include "src/__support/CPP/algorithm.h"
+#include "src/__support/FPUtil/sqrt.h"
+#include "src/time/gpu/time_utils.h"
 
 namespace LIBC_NAMESPACE {
 namespace libc_gpu_benchmarks {
@@ -32,27 +35,42 @@ BenchmarkResult benchmark(const BenchmarkOptions &options,
     iterations = 1;
 
   size_t samples = 0;
+  uint64_t total_time = 0;
   uint64_t best_guess = 0;
   uint64_t total_cycles = 0;
+  uint64_t cycles_2 = 0;
+  uint64_t min = UINT_MAX;
+  uint64_t max = 0;
   for (;;) {
     uint64_t sample_cycles = 0;
     uint64_t overhead = LIBC_NAMESPACE::overhead();
+    const clock_t start = (double)clock();
     for (uint32_t i = 0; i < iterations; i++) {
-      uint64_t result = wrapper_func() - overhead;
+      auto wrapper_intermediate = wrapper_func();
+      uint64_t result = wrapper_intermediate - overhead;
+      max = cpp::max(max, result);
+      min = cpp::min(min, result);
       sample_cycles += result;
     }
-
+    const clock_t end = clock();
+    const clock_t duration_ns =
+        ((end - start) * 1000 * 1000 * 1000) / CLOCKS_PER_SEC;
+    total_time += duration_ns;
     samples++;
     total_cycles += sample_cycles;
+    cycles_2 += sample_cycles * sample_cycles;
+
     total_iterations += iterations;
     const double change_ratio =
         rep.compute_improvement({iterations, sample_cycles});
     best_guess = rep.current_estimation;
 
     if (samples >= options.max_samples ||
-        iterations >= options.max_iterations) {
+        iterations >= options.max_iterations ||
+        total_time >= options.max_duration) {
       break;
-    } else if (samples >= options.min_samples &&
+    } else if (total_time >= options.min_duration &&
+               samples >= options.min_samples &&
                change_ratio < options.epsilon) {
       break;
     }
@@ -60,8 +78,13 @@ BenchmarkResult benchmark(const BenchmarkOptions &options,
     iterations *= options.scaling_factor;
   }
   result.cycles = best_guess;
+  result.standard_deviation = fputil::sqrt((double)cycles_2 / total_iterations -
+                                           (best_guess * best_guess));
+  result.min = min;
+  result.max = max;
   result.samples = samples;
   result.total_iterations = total_iterations;
+  result.total_time = total_time;
   return result;
 };
 
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index 3d762631f2d96..798ae06086b1a 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -4,7 +4,9 @@
 #include "benchmarks/gpu/BenchmarkLogger.h"
 #include "benchmarks/gpu/timing/timing.h"
 #include "src/__support/CPP/functional.h"
+#include "src/__support/CPP/limits.h"
 #include "src/__support/CPP/string_view.h"
+#include "src/time/clock.h"
 
 #include <stddef.h>
 #include <stdint.h>
@@ -18,6 +20,8 @@ struct BenchmarkOptions {
   uint32_t max_iterations = 10000000;
   uint32_t min_samples = 4;
   uint32_t max_samples = 1000;
+  uint64_t min_duration = 0;                  // in nanoseconds (ns)
+  uint64_t max_duration = 1000 * 1000 * 1000; // 1e9 nanoseconds = 1 second
   double epsilon = 0.01;
   double scaling_factor = 1.4;
 };
@@ -61,8 +65,12 @@ class RuntimeEstimationProgression {
 
 struct BenchmarkResult {
   uint64_t cycles = 0;
+  double standard_deviation = 0;
+  uint64_t min = UINT_MAX;
+  uint64_t max = 0;
   size_t samples = 0;
   size_t total_iterations = 0;
+  clock_t total_time = 0;
 };
 
 BenchmarkResult benchmark(const BenchmarkOptions &options,
@@ -107,7 +115,9 @@ class WrapperBenchmark : public Benchmark {
     constexpr auto RESET = "\033[0m";
     blog << GREEN << "[ RUN      ] " << RESET << name << '\n';
     blog << GREEN << "[       OK ] " << RESET << name << ": " << result.cycles
-         << " cycles, " << result.total_iterations << " iterations\n";
+         << " cycles, " << result.min << " min, " << result.max << " max, "
+         << result.total_iterations << " iterations, " << result.total_time
+         << " ns, " << (long)result.standard_deviation << " stddev\n";
   }
   const cpp::string_view get_name() const override { return name; }
 };
diff --git a/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp b/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp
index 8d9c958bb7ed4..4050bc0ec77b9 100644
--- a/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp
+++ b/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp
@@ -7,16 +7,3 @@ uint64_t BM_IsAlnum() {
   return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalnum, x);
 }
 BENCHMARK(LlvmLibcIsAlNumGpuBenchmark, IsAlnumWrapper, BM_IsAlnum);
-
-[[gnu::noinline]] static uint64_t single_input_function(int x) {
-  asm volatile("" ::"r"(x)); // prevent the compiler from optimizing out x
-  return x;
-}
-
-uint64_t BM_IsAlnumWithOverhead() {
-  char x = 'c';
-  return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalnum, x) -
-         LIBC_NAMESPACE::latency(single_input_function, 0);
-}
-BENCHMARK(LlvmLibcIsAlNumGpuBenchmark, IsAlnumWithOverhead,
-          BM_IsAlnumWithOverhead);
diff --git a/libc/benchmarks/gpu/timing/nvptx/timing.h b/libc/benchmarks/gpu/timing/nvptx/timing.h
index 008432e6aa1d2..001bdd3686062 100644
--- a/libc/benchmarks/gpu/timing/nvptx/timing.h
+++ b/libc/benchmarks/gpu/timing/nvptx/timing.h
@@ -37,12 +37,7 @@ namespace LIBC_NAMESPACE {
 
 // Stimulate a simple function and obtain its latency in clock cycles on the
 // system. This function cannot be inlined or else it will disturb the very
-// deliccate balance of hard-coded dependencies.
-//
-// FIXME: This does not work in general on NVPTX because of further
-// optimizations ptxas performs. The only way to get consistent results is to
-// pass and extra "SHELL:-Xcuda-ptxas -O0" to CMake's compiler flag. This
-// negatively implacts performance but it is at least stable.
+// delicate balance of hard-coded dependencies.
 template <typename F, typename T>
 [[gnu::noinline]] static LIBC_INLINE uint64_t latency(F f, T t) {
   // We need to store the input somewhere to guarantee that the compiler will
@@ -53,7 +48,7 @@ template <typename F, typename T>
 
   // Get the current timestamp from the clock.
   gpu::sync_threads();
-  __nvvm_membar_sys();
+  gpu::memory_fence();
   uint64_t start = gpu::processor_clock();
 
   // This forces the compiler to load the input argument and run the clock cycle
@@ -70,7 +65,7 @@ template <typename F, typename T>
   // Obtain the current timestamp after running the calculation and force
   // ordering.
   uint64_t stop = gpu::processor_clock();
-  __nvvm_membar_sys();
+  gpu::memory_fence();
   gpu::sync_threads();
   asm volatile("" ::"r"(stop));
   volatile T output = result;
@@ -88,6 +83,7 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {
   asm volatile("" ::"r"(arg), "r"(arg2));
 
   gpu::sync_threads();
+  gpu::memory_fence();
   uint64_t start = gpu::processor_clock();
 
   asm volatile("" ::"r"(arg), "r"(arg2), "llr"(start));
@@ -97,6 +93,7 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {
   asm volatile("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :);
 
   uint64_t stop = gpu::processor_clock();
+  gpu::memory_fence();
   gpu::sync_threads();
   asm volatile("" ::"r"(stop));
   volatile auto output = result;

>From e50ea99befc4279ea1987c47cf5084a55f2f8a47 Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Wed, 22 May 2024 16:08:39 -0400
Subject: [PATCH 05/18] fixed vector for benchmarks

---
 libc/benchmarks/CMakeLists.txt           | 114 ++++++++++++-----------
 libc/benchmarks/gpu/CMakeLists.txt       |   1 +
 libc/benchmarks/gpu/LibcGpuBenchmark.cpp |  16 +---
 libc/benchmarks/gpu/LibcGpuBenchmark.h   |   4 +-
 4 files changed, 65 insertions(+), 70 deletions(-)

diff --git a/libc/benchmarks/CMakeLists.txt b/libc/benchmarks/CMakeLists.txt
index 8b51511e3b5cf..221d4e11d383d 100644
--- a/libc/benchmarks/CMakeLists.txt
+++ b/libc/benchmarks/CMakeLists.txt
@@ -1,14 +1,14 @@
 if(LIBC_TARGET_OS_IS_GPU)
-	add_subdirectory(gpu)
-	return()
+  add_subdirectory(gpu)
+  return()
 endif()
 
 find_package(Threads)
 
 set(LLVM_LINK_COMPONENTS
-	Support
-	TargetParser
-	)
+  Support
+  TargetParser
+)
 
 #==============================================================================
 # Add Unit Testing Support
@@ -16,35 +16,37 @@ set(LLVM_LINK_COMPONENTS
 
 function(add_libc_benchmark_unittest target_name)
 	if(NOT LLVM_INCLUDE_TESTS)
-	return()
+    return()
 	endif()
 
-	cmake_parse_arguments(
-	"LIBC_BENCHMARKS_UNITTEST"
-	"" # No optional arguments
-	"SUITE" # Single value arguments
-	"SRCS;DEPENDS" # Multi-value arguments
-	${ARGN}
+	cmake_parse_arguments(if(LIBC_TARGET_OS_IS_GPU)
+  add_subdirectory(gpu)
+  return()
+    "LIBC_BENCHMARKS_UNITTEST"
+    "" # No optional arguments
+    "SUITE" # Single value arguments
+    "SRCS;DEPENDS" # Multi-value arguments
+    ${ARGN}
 	)
 
-	add_executable(${target_name}
-	EXCLUDE_FROM_ALL
-	${LIBC_BENCHMARKS_UNITTEST_SRCS}
-	)
-	target_link_libraries(${target_name}
-	PRIVATE
-	llvm_gtest_main
-	llvm_gtest
-	${LIBC_BENCHMARKS_UNITTEST_DEPENDS}
-	)
-	llvm_update_compile_flags(${target_name})
-
-	add_custom_command(
-	TARGET ${target_name}
-	POST_BUILD
-	COMMAND $<TARGET_FILE:${target_name}>
-	)
-	add_dependencies(libc-benchmark-util-tests ${target_name})
+  add_executable(${target_name}
+    EXCLUDE_FROM_ALL
+    ${LIBC_BENCHMARKS_UNITTEST_SRCS}
+  )
+  target_link_libraries(${target_name}
+    PRIVATE
+    llvm_gtest_main
+    llvm_gtest
+    ${LIBC_BENCHMARKS_UNITTEST_DEPENDS}
+  )
+  llvm_update_compile_flags(${target_name})
+
+  add_custom_command(
+    TARGET ${target_name}
+    POST_BUILD
+    COMMAND $<TARGET_FILE:${target_name}>
+  )
+  add_dependencies(libc-benchmark-util-tests ${target_name})
 endfunction()
 
 #==============================================================================
@@ -53,32 +55,32 @@ endfunction()
 
 include(ExternalProject)
 ExternalProject_Add(google-benchmark-libc
-	EXCLUDE_FROM_ALL ON
-	PREFIX google-benchmark-libc
-	SOURCE_DIR ${LLVM_THIRD_PARTY_DIR}/benchmark
-	INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/google-benchmark-libc
-	CMAKE_CACHE_ARGS
-		-DBENCHMARK_ENABLE_EXCEPTIONS:BOOL=OFF
-		-DBENCHMARK_ENABLE_LTO:BOOL=OFF
-		-DBENCHMARK_ENABLE_TESTING:BOOL=OFF
-		-DBENCHMARK_ENABLE_WERROR:BOOL=${LLVM_ENABLE_WERROR}
-		-DBENCHMARK_FORCE_WERROR:BOOL=OFF
-		-DBENCHMARK_USE_LIBCXX:BOOL=OFF
-		-DCMAKE_BUILD_TYPE:STRING=Release
-
-		-DCMAKE_SYSTEM_NAME:STRING=${CMAKE_SYSTEM_NAME}
-		-DCMAKE_SYSTEM_PROCESSOR:STRING=${CMAKE_SYSTEM_PROCESSOR}
-		-DCMAKE_C_COMPILER:STRING=${CMAKE_C_COMPILER}
-		-DCMAKE_CXX_COMPILER:STRING=${CMAKE_CXX_COMPILER}
-		-DCMAKE_CXX_FLAGS:STRING=${CMAKE_CXX_FLAGS}
-		-DCMAKE_FIND_ROOT_PATH:STRING=${CMAKE_FIND_ROOT_PATH}
-
-		-DBUILD_SHARED_LIBS:BOOL=OFF
-		-DCMAKE_EXE_LINKER_FLAGS:STRING=-static
-
-		-DCMAKE_CXX_STANDARD:STRING=14
-		-DCMAKE_INSTALL_PREFIX:PATH=<INSTALL_DIR>
-	)
+  EXCLUDE_FROM_ALL ON
+  PREFIX google-benchmark-libc
+  SOURCE_DIR ${LLVM_THIRD_PARTY_DIR}/benchmark
+  INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/google-benchmark-libc
+  CMAKE_CACHE_ARGS
+    -DBENCHMARK_ENABLE_EXCEPTIONS:BOOL=OFF
+    -DBENCHMARK_ENABLE_LTO:BOOL=OFF
+    -DBENCHMARK_ENABLE_TESTING:BOOL=OFF
+    -DBENCHMARK_ENABLE_WERROR:BOOL=${LLVM_ENABLE_WERROR}
+    -DBENCHMARK_FORCE_WERROR:BOOL=OFF
+    -DBENCHMARK_USE_LIBCXX:BOOL=OFF
+    -DCMAKE_BUILD_TYPE:STRING=Release
+
+    -DCMAKE_SYSTEM_NAME:STRING=${CMAKE_SYSTEM_NAME}
+    -DCMAKE_SYSTEM_PROCESSOR:STRING=${CMAKE_SYSTEM_PROCESSOR}
+    -DCMAKE_C_COMPILER:STRING=${CMAKE_C_COMPILER}
+    -DCMAKE_CXX_COMPILER:STRING=${CMAKE_CXX_COMPILER}
+    -DCMAKE_CXX_FLAGS:STRING=${CMAKE_CXX_FLAGS}
+    -DCMAKE_FIND_ROOT_PATH:STRING=${CMAKE_FIND_ROOT_PATH}
+
+    -DBUILD_SHARED_LIBS:BOOL=OFF
+    -DCMAKE_EXE_LINKER_FLAGS:STRING=-static
+
+    -DCMAKE_CXX_STANDARD:STRING=14
+    -DCMAKE_INSTALL_PREFIX:PATH=<INSTALL_DIR>
+  )
 
 add_custom_target(libc-benchmark-util-tests)
 
diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
index b9ca85393cc2e..51fc267df807d 100644
--- a/libc/benchmarks/gpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/CMakeLists.txt
@@ -48,6 +48,7 @@ add_unittest_framework_library(
     libc.src.__support.OSUtil.osutil
     libc.src.__support.uint128
     libc.src.___support.FPUtil.sqrt
+    libc.src.__support.fixedvector
     libc.src.time.clock
     libc.benchmarks.gpu.timing.timing
 )
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index 3ecff18884b34..f8021c873242f 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -6,22 +6,16 @@
 namespace LIBC_NAMESPACE {
 namespace libc_gpu_benchmarks {
 
-Benchmark *Benchmark::start = nullptr;
-Benchmark *Benchmark::end = nullptr;
+FixedVector<Benchmark *, 64> benchmarks_to_run;
 
 void Benchmark::add_benchmark(Benchmark *benchmark) {
-  if (end == nullptr) {
-    start = benchmark;
-    end = benchmark;
-    return;
-  }
-  end->next = benchmark;
-  end = benchmark;
+  benchmarks_to_run.push_back(benchmark);
 }
 
 int Benchmark::run_benchmarks() {
-  for (Benchmark *b = start; b != nullptr; b = b->next)
-    b->run();
+  for (auto it = benchmarks_to_run.rbegin(), e = benchmarks_to_run.rend();
+       it != e; ++it)
+    (*it)->run();
   return 0;
 }
 
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index 798ae06086b1a..459e4d9b6ea98 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -6,6 +6,7 @@
 #include "src/__support/CPP/functional.h"
 #include "src/__support/CPP/limits.h"
 #include "src/__support/CPP/string_view.h"
+#include "src/__support/fixedvector.h"
 #include "src/time/clock.h"
 
 #include <stddef.h>
@@ -92,9 +93,6 @@ class Benchmark {
 private:
   virtual void run() = 0;
   virtual const cpp::string_view get_name() const = 0;
-
-  static Benchmark *start;
-  static Benchmark *end;
 };
 
 class WrapperBenchmark : public Benchmark {

>From a588fc5b2eac6e84dd0dc4f62bebfc428a695845 Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Wed, 22 May 2024 16:27:44 -0400
Subject: [PATCH 06/18] refactor cmake files

---
 libc/benchmarks/CMakeLists.txt               | 188 +++++++++----------
 libc/benchmarks/gpu/CMakeLists.txt           |   3 +-
 libc/benchmarks/gpu/src/ctype/CMakeLists.txt |  28 +--
 libc/cmake/modules/LLVMLibCTestRules.cmake   |  27 +--
 4 files changed, 122 insertions(+), 124 deletions(-)

diff --git a/libc/benchmarks/CMakeLists.txt b/libc/benchmarks/CMakeLists.txt
index 221d4e11d383d..0234ccb2a7a78 100644
--- a/libc/benchmarks/CMakeLists.txt
+++ b/libc/benchmarks/CMakeLists.txt
@@ -8,26 +8,24 @@ find_package(Threads)
 set(LLVM_LINK_COMPONENTS
   Support
   TargetParser
-)
+  )
 
 #==============================================================================
 # Add Unit Testing Support
 #==============================================================================
 
 function(add_libc_benchmark_unittest target_name)
-	if(NOT LLVM_INCLUDE_TESTS)
+  if(NOT LLVM_INCLUDE_TESTS)
     return()
-	endif()
+  endif()
 
-	cmake_parse_arguments(if(LIBC_TARGET_OS_IS_GPU)
-  add_subdirectory(gpu)
-  return()
+  cmake_parse_arguments(
     "LIBC_BENCHMARKS_UNITTEST"
     "" # No optional arguments
     "SUITE" # Single value arguments
     "SRCS;DEPENDS" # Multi-value arguments
     ${ARGN}
-	)
+  )
 
   add_executable(${target_name}
     EXCLUDE_FROM_ALL
@@ -55,99 +53,99 @@ endfunction()
 
 include(ExternalProject)
 ExternalProject_Add(google-benchmark-libc
-  EXCLUDE_FROM_ALL ON
-  PREFIX google-benchmark-libc
-  SOURCE_DIR ${LLVM_THIRD_PARTY_DIR}/benchmark
-  INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/google-benchmark-libc
-  CMAKE_CACHE_ARGS
-    -DBENCHMARK_ENABLE_EXCEPTIONS:BOOL=OFF
-    -DBENCHMARK_ENABLE_LTO:BOOL=OFF
-    -DBENCHMARK_ENABLE_TESTING:BOOL=OFF
-    -DBENCHMARK_ENABLE_WERROR:BOOL=${LLVM_ENABLE_WERROR}
-    -DBENCHMARK_FORCE_WERROR:BOOL=OFF
-    -DBENCHMARK_USE_LIBCXX:BOOL=OFF
-    -DCMAKE_BUILD_TYPE:STRING=Release
-
-    -DCMAKE_SYSTEM_NAME:STRING=${CMAKE_SYSTEM_NAME}
-    -DCMAKE_SYSTEM_PROCESSOR:STRING=${CMAKE_SYSTEM_PROCESSOR}
-    -DCMAKE_C_COMPILER:STRING=${CMAKE_C_COMPILER}
-    -DCMAKE_CXX_COMPILER:STRING=${CMAKE_CXX_COMPILER}
-    -DCMAKE_CXX_FLAGS:STRING=${CMAKE_CXX_FLAGS}
-    -DCMAKE_FIND_ROOT_PATH:STRING=${CMAKE_FIND_ROOT_PATH}
-
-    -DBUILD_SHARED_LIBS:BOOL=OFF
-    -DCMAKE_EXE_LINKER_FLAGS:STRING=-static
-
-    -DCMAKE_CXX_STANDARD:STRING=14
-    -DCMAKE_INSTALL_PREFIX:PATH=<INSTALL_DIR>
-  )
+        EXCLUDE_FROM_ALL ON
+        PREFIX google-benchmark-libc
+        SOURCE_DIR ${LLVM_THIRD_PARTY_DIR}/benchmark
+        INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/google-benchmark-libc
+        CMAKE_CACHE_ARGS
+          -DBENCHMARK_ENABLE_EXCEPTIONS:BOOL=OFF
+          -DBENCHMARK_ENABLE_LTO:BOOL=OFF
+          -DBENCHMARK_ENABLE_TESTING:BOOL=OFF
+          -DBENCHMARK_ENABLE_WERROR:BOOL=${LLVM_ENABLE_WERROR}
+          -DBENCHMARK_FORCE_WERROR:BOOL=OFF
+          -DBENCHMARK_USE_LIBCXX:BOOL=OFF
+          -DCMAKE_BUILD_TYPE:STRING=Release
+
+          -DCMAKE_SYSTEM_NAME:STRING=${CMAKE_SYSTEM_NAME}
+          -DCMAKE_SYSTEM_PROCESSOR:STRING=${CMAKE_SYSTEM_PROCESSOR}
+          -DCMAKE_C_COMPILER:STRING=${CMAKE_C_COMPILER}
+          -DCMAKE_CXX_COMPILER:STRING=${CMAKE_CXX_COMPILER}
+          -DCMAKE_CXX_FLAGS:STRING=${CMAKE_CXX_FLAGS}
+          -DCMAKE_FIND_ROOT_PATH:STRING=${CMAKE_FIND_ROOT_PATH}
+
+          -DBUILD_SHARED_LIBS:BOOL=OFF
+          -DCMAKE_EXE_LINKER_FLAGS:STRING=-static
+
+          -DCMAKE_CXX_STANDARD:STRING=14
+          -DCMAKE_INSTALL_PREFIX:PATH=<INSTALL_DIR>
+        )
 
 add_custom_target(libc-benchmark-util-tests)
 
 # libc-benchmark
 add_library(libc-benchmark
-	STATIC
-	EXCLUDE_FROM_ALL
-	LibcBenchmark.cpp
-	LibcBenchmark.h
+    STATIC
+    EXCLUDE_FROM_ALL
+    LibcBenchmark.cpp
+    LibcBenchmark.h
 )
 
 target_include_directories(libc-benchmark
-	PUBLIC ${LLVM_INCLUDE_DIR} ${LLVM_MAIN_INCLUDE_DIR}
+    PUBLIC ${LLVM_INCLUDE_DIR} ${LLVM_MAIN_INCLUDE_DIR}
 )
 target_link_libraries(libc-benchmark
-	PUBLIC
-	benchmark::benchmark
-		LLVMSupport
-	LLVMTargetParser
-	Threads::Threads
+    PUBLIC
+    benchmark::benchmark
+    LLVMSupport
+    LLVMTargetParser
+    Threads::Threads
 )
 add_dependencies(libc-benchmark google-benchmark-libc)
 llvm_update_compile_flags(libc-benchmark)
 
 add_libc_benchmark_unittest(libc-benchmark-test
-	SRCS LibcBenchmarkTest.cpp
-	DEPENDS libc-benchmark
+    SRCS LibcBenchmarkTest.cpp
+    DEPENDS libc-benchmark
 )
 
 # libc-memory-benchmark
 add_library(libc-memory-benchmark
-	STATIC
-	EXCLUDE_FROM_ALL
-	LibcMemoryBenchmark.cpp
-	LibcMemoryBenchmark.h
-	LibcFunctionPrototypes.h
-	MemorySizeDistributions.cpp
-	MemorySizeDistributions.h
+    STATIC
+    EXCLUDE_FROM_ALL
+    LibcMemoryBenchmark.cpp
+    LibcMemoryBenchmark.h
+    LibcFunctionPrototypes.h
+    MemorySizeDistributions.cpp
+    MemorySizeDistributions.h
 )
 target_include_directories(libc-memory-benchmark
-	PUBLIC
-	${CMAKE_CURRENT_SOURCE_DIR}
+    PUBLIC
+    ${CMAKE_CURRENT_SOURCE_DIR}
 )
 target_link_libraries(libc-memory-benchmark
-	PUBLIC
-	libc-benchmark
+    PUBLIC
+    libc-benchmark
 )
 llvm_update_compile_flags(libc-memory-benchmark)
 
 add_libc_benchmark_unittest(libc-memory-benchmark-test
-	SRCS LibcMemoryBenchmarkTest.cpp
-	DEPENDS libc-memory-benchmark
+    SRCS LibcMemoryBenchmarkTest.cpp
+    DEPENDS libc-memory-benchmark
 )
 
 # json
 add_library(json
-	STATIC
-	EXCLUDE_FROM_ALL
-	JSON.cpp
-	JSON.h
+    STATIC
+    EXCLUDE_FROM_ALL
+    JSON.cpp
+    JSON.h
 )
 target_link_libraries(json PUBLIC libc-memory-benchmark)
 llvm_update_compile_flags(json)
 
 add_libc_benchmark_unittest(json-test
-	SRCS JSONTest.cpp
-	DEPENDS json
+    SRCS JSONTest.cpp
+    DEPENDS json
 )
 
 #==============================================================================
@@ -156,25 +154,25 @@ add_libc_benchmark_unittest(json-test
 
 # Benchmark all implementations that can run on the target CPU.
 function(add_libc_multi_impl_benchmark name)
-	get_property(fq_implementations GLOBAL PROPERTY ${name}_implementations)
-	foreach(fq_config_name IN LISTS fq_implementations)
-	get_target_property(required_cpu_features ${fq_config_name} REQUIRE_CPU_FEATURES)
-	cpu_supports(can_run "${required_cpu_features}")
-	if(can_run)
-	set(benchmark_name ${fq_config_name}_benchmark)
-	add_executable(${benchmark_name}
-		EXCLUDE_FROM_ALL
-		LibcMemoryBenchmarkMain.cpp
-	)
-	get_target_property(entrypoint_object_file ${fq_config_name} "OBJECT_FILE_RAW")
-	target_link_libraries(${benchmark_name} PUBLIC json ${entrypoint_object_file})
-	string(TOUPPER ${name} name_upper)
-	target_compile_definitions(${benchmark_name} PRIVATE "-DLIBC_BENCHMARK_FUNCTION_${name_upper}=LIBC_NAMESPACE::${name}" "-DLIBC_BENCHMARK_FUNCTION_NAME=\"${fq_config_name}\"")
-	llvm_update_compile_flags(${benchmark_name})
-	else()
-		message(STATUS "Skipping benchmark for '${fq_config_name}' insufficient host cpu features '${required_cpu_features}'")
-	endif()
-	endforeach()
+  get_property(fq_implementations GLOBAL PROPERTY ${name}_implementations)
+  foreach(fq_config_name IN LISTS fq_implementations)
+    get_target_property(required_cpu_features ${fq_config_name} REQUIRE_CPU_FEATURES)
+    cpu_supports(can_run "${required_cpu_features}")
+    if(can_run)
+        set(benchmark_name ${fq_config_name}_benchmark)
+        add_executable(${benchmark_name}
+            EXCLUDE_FROM_ALL
+            LibcMemoryBenchmarkMain.cpp
+        )
+        get_target_property(entrypoint_object_file ${fq_config_name} "OBJECT_FILE_RAW")
+        target_link_libraries(${benchmark_name} PUBLIC json ${entrypoint_object_file})
+        string(TOUPPER ${name} name_upper)
+        target_compile_definitions(${benchmark_name} PRIVATE "-DLIBC_BENCHMARK_FUNCTION_${name_upper}=LIBC_NAMESPACE::${name}" "-DLIBC_BENCHMARK_FUNCTION_NAME=\"${fq_config_name}\"")
+        llvm_update_compile_flags(${benchmark_name})
+    else()
+      message(STATUS "Skipping benchmark for '${fq_config_name}' insufficient host cpu features '${required_cpu_features}'")
+    endif()
+  endforeach()
 endfunction()
 
 add_libc_multi_impl_benchmark(bcmp)
@@ -192,20 +190,20 @@ add_libc_multi_impl_benchmark(memset)
 # libc memory functions compiled for the host machine. This is useful to
 # continuously monitor the performance of the memory functions.
 add_executable(libc.benchmarks.memory_functions.opt_host
-	EXCLUDE_FROM_ALL
-	LibcMemoryGoogleBenchmarkMain.cpp
-	LibcDefaultImplementations.cpp
+  EXCLUDE_FROM_ALL
+  LibcMemoryGoogleBenchmarkMain.cpp
+  LibcDefaultImplementations.cpp
 )
 target_link_libraries(libc.benchmarks.memory_functions.opt_host
-	PRIVATE
-	libc-memory-benchmark
-	libc.src.string.memcmp_opt_host.__internal__
-	libc.src.string.bcmp_opt_host.__internal__
-	libc.src.string.memcpy_opt_host.__internal__
-	libc.src.string.memset_opt_host.__internal__
-	libc.src.string.bzero_opt_host.__internal__
-	libc.src.string.memmove_opt_host.__internal__
-	benchmark_main
+  PRIVATE
+  libc-memory-benchmark
+  libc.src.string.memcmp_opt_host.__internal__
+  libc.src.string.bcmp_opt_host.__internal__
+  libc.src.string.memcpy_opt_host.__internal__
+  libc.src.string.memset_opt_host.__internal__
+  libc.src.string.bzero_opt_host.__internal__
+  libc.src.string.memmove_opt_host.__internal__
+  benchmark_main
 )
 llvm_update_compile_flags(libc.benchmarks.memory_functions.opt_host)
 
diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
index 51fc267df807d..9ed45eedc402e 100644
--- a/libc/benchmarks/gpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/CMakeLists.txt
@@ -13,9 +13,8 @@ function(add_benchmark benchmark_name)
   if(NOT libc.src.time.clock IN_LIST TARGET_LLVMLIBC_ENTRYPOINTS)
     message(FATAL_ERROR "target does not support clock")
   endif()
-  add_libc_hermetic_test(
+  add_libc_hermetic(
     ${benchmark_name}
-    IS_BENCHMARK
     LINK_LIBRARIES
       LibcGpuBenchmark.hermetic
       ${BENCHMARK_LINK_LIBRARIES}
diff --git a/libc/benchmarks/gpu/src/ctype/CMakeLists.txt b/libc/benchmarks/gpu/src/ctype/CMakeLists.txt
index 8d448b8ced955..79f01425770da 100644
--- a/libc/benchmarks/gpu/src/ctype/CMakeLists.txt
+++ b/libc/benchmarks/gpu/src/ctype/CMakeLists.txt
@@ -1,21 +1,21 @@
 add_custom_target(libc-gpu-ctype-benchmarks)
 
 add_benchmark(
-	isalnum_benchmark
-	SUITE
-		libc-gpu-ctype-benchmarks
-	SRCS
-		isalnum_benchmark.cpp
-	DEPENDS
-		libc.src.ctype.isalnum
+  isalnum_benchmark
+  SUITE
+    libc-gpu-ctype-benchmarks
+  SRCS
+    isalnum_benchmark.cpp
+  DEPENDS
+    libc.src.ctype.isalnum
 )
 
 add_benchmark(
-	isalpha_benchmark
-	SUITE
-		libc-gpu-ctype-benchmarks
-	SRCS
-		isalpha_benchmark.cpp
-	DEPENDS
-		libc.src.ctype.isalpha
+  isalpha_benchmark
+  SUITE
+    libc-gpu-ctype-benchmarks
+  SRCS
+    isalpha_benchmark.cpp
+  DEPENDS
+    libc.src.ctype.isalpha
 )
diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake
index 278137774e089..508694ae9fc01 100644
--- a/libc/cmake/modules/LLVMLibCTestRules.cmake
+++ b/libc/cmake/modules/LLVMLibCTestRules.cmake
@@ -526,12 +526,15 @@ function(add_integration_test test_name)
   add_dependencies(${INTEGRATION_TEST_SUITE} ${fq_target_name})
 endfunction(add_integration_test)
 
-# Rule to add a hermetic test. A hermetic test is one whose executable is fully
+# Rule to add a hermetic program. A hermetic program is one whose executable is fully
 # statically linked and consists of pieces drawn only from LLVM's libc. Nothing,
 # including the startup objects, come from the system libc.
 #
+# For the GPU, the target is a test or a benchmark depending on which library
+# (LibcTest.hermetic or LibcGpuBenchmark.hermetic) LINK_LIBRARIES provides.
+#
 # Usage:
-#   add_libc_hermetic_test(
+#   add_libc_hermetic(
 #     <target name>
 #     SUITE <the suite to which the test should belong>
 #     SRCS <src1.cpp> [src2.cpp ...]
@@ -543,14 +546,14 @@ endfunction(add_integration_test)
 #     LINK_LIBRARIES <list of linking libraries for this target>
 #     LOADER_ARGS <list of special args to loaders (like the GPU loader)>
 #   )
-function(add_libc_hermetic_test test_name)
+function(add_libc_hermetic test_name)
   if(NOT TARGET libc.startup.${LIBC_TARGET_OS}.crt1)
     message(VERBOSE "Skipping ${fq_target_name} as it is not available on ${LIBC_TARGET_OS}.")
     return()
   endif()
   cmake_parse_arguments(
     "HERMETIC_TEST"
-    "IS_BENCHMARK" # Optional arguments
+    "" # No optional arguments
     "SUITE" # Single value arguments
     "SRCS;HDRS;DEPENDS;ARGS;ENV;COMPILE_OPTIONS;LINK_LIBRARIES;LOADER_ARGS" # Multi-value arguments
     ${ARGN}
@@ -651,13 +654,6 @@ function(add_libc_hermetic_test test_name)
     endif()
   endforeach()
 
-  # Benchmarks requires a separate library with a different `main` function
-  if(HERMETIC_TEST_IS_BENCHMARK)
-    list(APPEND link_libraries LibcGpuBenchmark.hermetic)
-  else()
-    list(APPEND link_libraries LibcTest.hermetic)
-  endif()
-
   if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
     target_link_options(${fq_build_target_name} PRIVATE
       ${LIBC_COMPILE_OPTIONS_DEFAULT}
@@ -721,7 +717,7 @@ function(add_libc_hermetic_test test_name)
 
   add_dependencies(${HERMETIC_TEST_SUITE} ${fq_target_name})
   add_dependencies(libc-hermetic-tests ${fq_target_name})
-endfunction(add_libc_hermetic_test)
+endfunction(add_libc_hermetic)
 
 # A convenience function to add both a unit test as well as a hermetic test.
 function(add_libc_test test_name)
@@ -736,7 +732,12 @@ function(add_libc_test test_name)
     add_libc_unittest(${test_name}.__unit__ ${LIBC_TEST_UNPARSED_ARGUMENTS})
   endif()
   if(LIBC_ENABLE_HERMETIC_TESTS AND NOT LIBC_TEST_UNIT_TEST_ONLY)
-    add_libc_hermetic_test(${test_name}.__hermetic__ ${LIBC_TEST_UNPARSED_ARGUMENTS})
+    add_libc_hermetic(
+      ${test_name}.__hermetic__
+      LINK_LIBRARIES
+        LibcTest.hermetic
+      ${LIBC_TEST_UNPARSED_ARGUMENTS}
+    )
     get_fq_target_name(${test_name} fq_test_name)
     if(TARGET ${fq_test_name}.__hermetic__ AND TARGET ${fq_test_name}.__unit__)
       # Tests like the file tests perform file operations on disk file. If we

>From be303da366eb2e7dd42ad12c206268b4259264c3 Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Thu, 23 May 2024 12:34:21 -0400
Subject: [PATCH 07/18] rename namespace and refactor casts

---
 libc/benchmarks/gpu/BenchmarkLogger.cpp      |  4 +-
 libc/benchmarks/gpu/BenchmarkLogger.h        |  4 +-
 libc/benchmarks/gpu/LibcGpuBenchmark.cpp     | 31 +++++------
 libc/benchmarks/gpu/LibcGpuBenchmark.h       | 54 ++++++++------------
 libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp |  2 +-
 5 files changed, 42 insertions(+), 53 deletions(-)

diff --git a/libc/benchmarks/gpu/BenchmarkLogger.cpp b/libc/benchmarks/gpu/BenchmarkLogger.cpp
index 4f70d23a1e95e..9a36ee5b3046c 100644
--- a/libc/benchmarks/gpu/BenchmarkLogger.cpp
+++ b/libc/benchmarks/gpu/BenchmarkLogger.cpp
@@ -9,7 +9,7 @@
 #include <stdint.h>
 
 namespace LIBC_NAMESPACE {
-namespace libc_gpu_benchmarks {
+namespace benchmarks {
 
 // cpp::string_view specialization
 template <>
@@ -93,5 +93,5 @@ template BenchmarkLogger &BenchmarkLogger::operator<< <UInt<320>>(UInt<320>);
 
 BenchmarkLogger blog;
 
-} // namespace libc_gpu_benchmarks
+} // namespace benchmarks
 } // namespace LIBC_NAMESPACE
diff --git a/libc/benchmarks/gpu/BenchmarkLogger.h b/libc/benchmarks/gpu/BenchmarkLogger.h
index ed3cc97e59c6d..98813b28eaa91 100644
--- a/libc/benchmarks/gpu/BenchmarkLogger.h
+++ b/libc/benchmarks/gpu/BenchmarkLogger.h
@@ -10,7 +10,7 @@
 #define LLVM_LIBC_BENCHMARKS_GPU_BENCHMARKLOGGER_H
 
 namespace LIBC_NAMESPACE {
-namespace libc_gpu_benchmarks {
+namespace benchmarks {
 
 // A class to log to standard output in the context of hermetic tests.
 struct BenchmarkLogger {
@@ -21,7 +21,7 @@ struct BenchmarkLogger {
 // A global TestLogger instance to be used in tests.
 extern BenchmarkLogger blog;
 
-} // namespace libc_gpu_benchmarks
+} // namespace benchmarks
 } // namespace LIBC_NAMESPACE
 
 #endif /* LLVM_LIBC_BENCHMARKS_GPU_BENCHMARKLOGGER_H */
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index f8021c873242f..4c49839249d56 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -4,7 +4,7 @@
 #include "src/time/gpu/time_utils.h"
 
 namespace LIBC_NAMESPACE {
-namespace libc_gpu_benchmarks {
+namespace benchmarks {
 
 FixedVector<Benchmark *, 64> benchmarks_to_run;
 
@@ -23,22 +23,22 @@ BenchmarkResult benchmark(const BenchmarkOptions &options,
                           cpp::function<uint64_t(void)> wrapper_func) {
   BenchmarkResult result;
   RuntimeEstimationProgression rep;
-  size_t total_iterations = 0;
-  size_t iterations = options.initial_iterations;
-  if (iterations < (uint32_t)1)
+  uint32_t total_iterations = 0;
+  uint32_t iterations = options.initial_iterations;
+  if (iterations < 1u)
     iterations = 1;
 
-  size_t samples = 0;
+  uint32_t samples = 0;
   uint64_t total_time = 0;
   uint64_t best_guess = 0;
   uint64_t total_cycles = 0;
-  uint64_t cycles_2 = 0;
-  uint64_t min = UINT_MAX;
+  uint64_t cycles_squared = 0;
+  uint64_t min = UINT64_MAX;
   uint64_t max = 0;
-  for (;;) {
+  for (uint64_t time_budget = options.max_duration; time_budget >= 0;) {
     uint64_t sample_cycles = 0;
     uint64_t overhead = LIBC_NAMESPACE::overhead();
-    const clock_t start = (double)clock();
+    const clock_t start = static_cast<double>(clock());
     for (uint32_t i = 0; i < iterations; i++) {
       auto wrapper_intermediate = wrapper_func();
       uint64_t result = wrapper_intermediate - overhead;
@@ -50,9 +50,10 @@ BenchmarkResult benchmark(const BenchmarkOptions &options,
     const clock_t duration_ns =
         ((end - start) * 1000 * 1000 * 1000) / CLOCKS_PER_SEC;
     total_time += duration_ns;
+    time_budget -= duration_ns;
     samples++;
     total_cycles += sample_cycles;
-    cycles_2 += sample_cycles * sample_cycles;
+    cycles_squared += sample_cycles * sample_cycles;
 
     total_iterations += iterations;
     const double change_ratio =
@@ -60,8 +61,7 @@ BenchmarkResult benchmark(const BenchmarkOptions &options,
     best_guess = rep.current_estimation;
 
     if (samples >= options.max_samples ||
-        iterations >= options.max_iterations ||
-        total_time >= options.max_duration) {
+        iterations >= options.max_iterations) {
       break;
     } else if (total_time >= options.min_duration &&
                samples >= options.min_samples &&
@@ -72,8 +72,9 @@ BenchmarkResult benchmark(const BenchmarkOptions &options,
     iterations *= options.scaling_factor;
   }
   result.cycles = best_guess;
-  result.standard_deviation = fputil::sqrt((double)cycles_2 / total_iterations -
-                                           (best_guess * best_guess));
+  result.standard_deviation =
+      fputil::sqrt(static_cast<double>(cycles_squared) / total_iterations -
+                   (best_guess * best_guess));
   result.min = min;
   result.max = max;
   result.samples = samples;
@@ -82,5 +83,5 @@ BenchmarkResult benchmark(const BenchmarkOptions &options,
   return result;
 };
 
-} // namespace libc_gpu_benchmarks
+} // namespace benchmarks
 } // namespace LIBC_NAMESPACE
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index 459e4d9b6ea98..20543af66e331 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -9,12 +9,11 @@
 #include "src/__support/fixedvector.h"
 #include "src/time/clock.h"
 
-#include <stddef.h>
 #include <stdint.h>
 
 namespace LIBC_NAMESPACE {
 
-namespace libc_gpu_benchmarks {
+namespace benchmarks {
 
 struct BenchmarkOptions {
   uint32_t initial_iterations = 1;
@@ -28,13 +27,13 @@ struct BenchmarkOptions {
 };
 
 struct Measurement {
-  size_t iterations = 0;
+  uint32_t iterations = 0;
   uint64_t elapsed_cycles = 0;
 };
 
 class RefinableRuntimeEstimation {
   uint64_t total_cycles = 0;
-  size_t total_iterations = 0;
+  uint32_t total_iterations = 0;
 
 public:
   uint64_t update(const Measurement &M) {
@@ -53,7 +52,8 @@ class RuntimeEstimationProgression {
 
   double compute_improvement(const Measurement &M) {
     const uint64_t new_estimation = rre.update(M);
-    double ratio = ((double)current_estimation / new_estimation) - 1.0;
+    double ratio =
+        (static_cast<double>(current_estimation) / new_estimation) - 1.0;
 
     // Get absolute value
     if (ratio < 0)
@@ -67,10 +67,10 @@ class RuntimeEstimationProgression {
 struct BenchmarkResult {
   uint64_t cycles = 0;
   double standard_deviation = 0;
-  uint64_t min = UINT_MAX;
+  uint64_t min = UINT64_MAX;
   uint64_t max = 0;
-  size_t samples = 0;
-  size_t total_iterations = 0;
+  uint32_t samples = 0;
+  uint32_t total_iterations = 0;
   clock_t total_time = 0;
 };
 
@@ -78,35 +78,22 @@ BenchmarkResult benchmark(const BenchmarkOptions &options,
                           cpp::function<uint64_t(void)> wrapper_func);
 
 class Benchmark {
-  Benchmark *next = nullptr;
-
-public:
-  virtual ~Benchmark() {}
-  virtual void set_up() {}
-  virtual void tear_down() {}
-
-  static int run_benchmarks();
-
-protected:
-  static void add_benchmark(Benchmark *);
-
-private:
-  virtual void run() = 0;
-  virtual const cpp::string_view get_name() const = 0;
-};
-
-class WrapperBenchmark : public Benchmark {
   const cpp::function<uint64_t(void)> func;
   const cpp::string_view name;
 
 public:
-  WrapperBenchmark(cpp::function<uint64_t(void)> func, char const *name)
+  Benchmark(cpp::function<uint64_t(void)> func, char const *name)
       : func(func), name(name) {
     add_benchmark(this);
   }
 
+  static int run_benchmarks();
+
+protected:
+  static void add_benchmark(Benchmark *benchmark);
+
 private:
-  void run() override {
+  void run() {
     BenchmarkOptions options;
     auto result = benchmark(options, func);
     constexpr auto GREEN = "\033[32m";
@@ -115,15 +102,16 @@ class WrapperBenchmark : public Benchmark {
     blog << GREEN << "[       OK ] " << RESET << name << ": " << result.cycles
          << " cycles, " << result.min << " min, " << result.max << " max, "
          << result.total_iterations << " iterations, " << result.total_time
-         << " ns, " << (long)result.standard_deviation << " stddev\n";
+         << " ns, " << static_cast<long>(result.standard_deviation)
+         << " stddev\n";
   }
-  const cpp::string_view get_name() const override { return name; }
+  const cpp::string_view get_name() const { return name; }
 };
-} // namespace libc_gpu_benchmarks
+} // namespace benchmarks
 } // namespace LIBC_NAMESPACE
 
 #define BENCHMARK(SuiteName, TestName, Func)                                   \
-  LIBC_NAMESPACE::libc_gpu_benchmarks::WrapperBenchmark                        \
-      SuiteName##_##TestName##_Instance(Func, #SuiteName "." #TestName);
+  LIBC_NAMESPACE::benchmarks::Benchmark SuiteName##_##TestName##_Instance(     \
+      Func, #SuiteName "." #TestName);
 
 #endif
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp b/libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp
index 510fd13210494..97366e55194a9 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp
@@ -1,6 +1,6 @@
 #include "LibcGpuBenchmark.h"
 
 extern "C" int main(int argc, char **argv, char **envp) {
-  LIBC_NAMESPACE::libc_gpu_benchmarks::Benchmark::run_benchmarks();
+  LIBC_NAMESPACE::benchmarks::Benchmark::run_benchmarks();
   return 0;
 }

>From a41eb326ad5d4d99ba24ab17c5ae55ce022b43af Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Mon, 27 May 2024 11:32:30 -0400
Subject: [PATCH 08/18] repeat overhead measurement outside of loop

---
 libc/benchmarks/gpu/LibcGpuBenchmark.cpp | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index 4c49839249d56..e91d2b400444a 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -35,9 +35,15 @@ BenchmarkResult benchmark(const BenchmarkOptions &options,
   uint64_t cycles_squared = 0;
   uint64_t min = UINT64_MAX;
   uint64_t max = 0;
+
+  uint64_t total_overhead_cycles = 0;
+  uint32_t overhead_iterations = 10;
+  for (int i = 0; i < overhead_iterations; i++)
+    total_overhead_cycles += LIBC_NAMESPACE::overhead();
+  uint64_t overhead = total_overhead_cycles / overhead_iterations;
+
   for (uint64_t time_budget = options.max_duration; time_budget >= 0;) {
     uint64_t sample_cycles = 0;
-    uint64_t overhead = LIBC_NAMESPACE::overhead();
     const clock_t start = static_cast<double>(clock());
     for (uint32_t i = 0; i < iterations; i++) {
       auto wrapper_intermediate = wrapper_func();

>From ab6b6cae819bdd2ba7da32292063a7bcd6620e10 Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Mon, 27 May 2024 11:37:14 -0400
Subject: [PATCH 09/18] switch to using min measurement for overhead

---
 libc/benchmarks/gpu/LibcGpuBenchmark.cpp | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index e91d2b400444a..f0ba3af23a140 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -36,11 +36,10 @@ BenchmarkResult benchmark(const BenchmarkOptions &options,
   uint64_t min = UINT64_MAX;
   uint64_t max = 0;
 
-  uint64_t total_overhead_cycles = 0;
-  uint32_t overhead_iterations = 10;
+  uint64_t overhead = UINT64_MAX;
+  int overhead_iterations = 10;
   for (int i = 0; i < overhead_iterations; i++)
-    total_overhead_cycles += LIBC_NAMESPACE::overhead();
-  uint64_t overhead = total_overhead_cycles / overhead_iterations;
+    overhead = cpp::min(overhead, LIBC_NAMESPACE::overhead());
 
   for (uint64_t time_budget = options.max_duration; time_budget >= 0;) {
     uint64_t sample_cycles = 0;

>From c7c8445f76fef4923d0607208a621bcd7a8ef58d Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Tue, 28 May 2024 22:46:04 -0400
Subject: [PATCH 10/18] fix style

---
 libc/benchmarks/gpu/LibcGpuBenchmark.cpp | 19 +++++++------------
 libc/benchmarks/gpu/LibcGpuBenchmark.h   |  2 +-
 2 files changed, 8 insertions(+), 13 deletions(-)

diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index f0ba3af23a140..e4f839e361dd0 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -6,17 +6,15 @@
 namespace LIBC_NAMESPACE {
 namespace benchmarks {
 
-FixedVector<Benchmark *, 64> benchmarks_to_run;
+FixedVector<Benchmark *, 64> benchmarks;
 
 void Benchmark::add_benchmark(Benchmark *benchmark) {
-  benchmarks_to_run.push_back(benchmark);
+  benchmarks.push_back(benchmark);
 }
 
-int Benchmark::run_benchmarks() {
-  for (auto it = benchmarks_to_run.rbegin(), e = benchmarks_to_run.rend();
-       it != e; ++it)
+void Benchmark::run_benchmarks() {
+  for (auto it = benchmarks.rbegin(), e = benchmarks.rend(); it != e; ++it)
     (*it)->run();
-  return 0;
 }
 
 BenchmarkResult benchmark(const BenchmarkOptions &options,
@@ -65,14 +63,11 @@ BenchmarkResult benchmark(const BenchmarkOptions &options,
         rep.compute_improvement({iterations, sample_cycles});
     best_guess = rep.current_estimation;
 
-    if (samples >= options.max_samples ||
-        iterations >= options.max_iterations) {
+    if (samples >= options.max_samples || iterations >= options.max_iterations)
       break;
-    } else if (total_time >= options.min_duration &&
-               samples >= options.min_samples &&
-               change_ratio < options.epsilon) {
+    if (total_time >= options.min_duration && samples >= options.min_samples &&
+        change_ratio < options.epsilon)
       break;
-    }
 
     iterations *= options.scaling_factor;
   }
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index 20543af66e331..08e99dadc8d07 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -87,7 +87,7 @@ class Benchmark {
     add_benchmark(this);
   }
 
-  static int run_benchmarks();
+  static void run_benchmarks();
 
 protected:
   static void add_benchmark(Benchmark *benchmark);

>From c857891c2f30ffba251fcce6be2d647d39a2bf69 Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Tue, 28 May 2024 23:07:26 -0400
Subject: [PATCH 11/18] unconditionally add benchmarks in gpu build

---
 libc/CMakeLists.txt            | 4 +---
 libc/benchmarks/CMakeLists.txt | 5 +++++
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt
index f35471a06a53e..4ffcd55ba9500 100644
--- a/libc/CMakeLists.txt
+++ b/libc/CMakeLists.txt
@@ -401,9 +401,7 @@ if(LLVM_INCLUDE_TESTS)
   add_subdirectory(fuzzing)
 endif()
 
-if(LIBC_INCLUDE_BENCHMARKS)
-  add_subdirectory(benchmarks)
-endif()
+add_subdirectory(benchmarks)
 
 if (LIBC_INCLUDE_DOCS)
   add_subdirectory(docs)
diff --git a/libc/benchmarks/CMakeLists.txt b/libc/benchmarks/CMakeLists.txt
index 0234ccb2a7a78..0cff6eb12c247 100644
--- a/libc/benchmarks/CMakeLists.txt
+++ b/libc/benchmarks/CMakeLists.txt
@@ -3,6 +3,11 @@ if(LIBC_TARGET_OS_IS_GPU)
   return()
 endif()
 
+# The CPU build depends on Google benchmark.
+if(NOT LIBC_INCLUDE_BENCHMARKS)
+  return()
+endif()
+
 find_package(Threads)
 
 set(LLVM_LINK_COMPONENTS

>From 6073de7b30620397831ba76b5d587b88e035c14e Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Wed, 29 May 2024 22:30:00 -0400
Subject: [PATCH 12/18] add forward iterator

---
 libc/benchmarks/gpu/LibcGpuBenchmark.cpp |  4 +-
 libc/src/__support/CPP/array.h           | 26 +++++----
 libc/src/__support/CPP/iterator.h        | 68 ++++++++++++++++++++++++
 libc/src/__support/fixedvector.h         |  4 ++
 4 files changed, 90 insertions(+), 12 deletions(-)

diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index e4f839e361dd0..a7a02cacc3305 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -13,8 +13,8 @@ void Benchmark::add_benchmark(Benchmark *benchmark) {
 }
 
 void Benchmark::run_benchmarks() {
-  for (auto it = benchmarks.rbegin(), e = benchmarks.rend(); it != e; ++it)
-    (*it)->run();
+  for (Benchmark *benchmark : benchmarks)
+    benchmark->run();
 }
 
 BenchmarkResult benchmark(const BenchmarkOptions &options,
diff --git a/libc/src/__support/CPP/array.h b/libc/src/__support/CPP/array.h
index 4e69ba003e800..7e4cf29847daf 100644
--- a/libc/src/__support/CPP/array.h
+++ b/libc/src/__support/CPP/array.h
@@ -22,10 +22,12 @@ template <class T, size_t N> struct array {
 
   T Data[N];
   using value_type = T;
-  using iterator = T *;
-  using const_iterator = const T *;
-  using reverse_iterator = cpp::reverse_iterator<iterator>;
-  using const_reverse_iterator = cpp::reverse_iterator<const_iterator>;
+  using pointer_type = T *;
+  using iterator = cpp::iterator<pointer_type>;
+  using const_pointer_type = const T *;
+  using const_iterator = cpp::iterator<const_pointer_type>;
+  using reverse_iterator = cpp::reverse_iterator<pointer_type>;
+  using const_reverse_iterator = cpp::reverse_iterator<const_pointer_type>;
 
   LIBC_INLINE constexpr T *data() { return Data; }
   LIBC_INLINE constexpr const T *data() const { return Data; }
@@ -46,12 +48,16 @@ template <class T, size_t N> struct array {
 
   LIBC_INLINE constexpr bool empty() const { return N == 0; }
 
-  LIBC_INLINE constexpr iterator begin() { return Data; }
-  LIBC_INLINE constexpr const_iterator begin() const { return Data; }
+  LIBC_INLINE constexpr iterator begin() { return iterator{Data}; }
+  LIBC_INLINE constexpr const_iterator begin() const {
+    return const_iterator{Data};
+  }
   LIBC_INLINE constexpr const_iterator cbegin() const { return begin(); }
 
-  LIBC_INLINE constexpr iterator end() { return Data + N; }
-  LIBC_INLINE constexpr const_iterator end() const { return Data + N; }
+  LIBC_INLINE constexpr iterator end() { return iterator{Data + N}; }
+  LIBC_INLINE constexpr const_iterator end() const {
+    return const_iterator{Data + N};
+  }
   LIBC_INLINE constexpr const_iterator cend() const { return end(); }
 
   LIBC_INLINE constexpr reverse_iterator rbegin() {
@@ -65,10 +71,10 @@ template <class T, size_t N> struct array {
   }
 
   LIBC_INLINE constexpr reverse_iterator rend() {
-    return reverse_iterator{begin()};
+    return reverse_iterator{Data};
   }
   LIBC_INLINE constexpr const_reverse_iterator rend() const {
-    return const_reverse_iterator{begin()};
+    return const_reverse_iterator{Data};
   }
   LIBC_INLINE constexpr const_reverse_iterator crend() const { return rend(); }
 };
diff --git a/libc/src/__support/CPP/iterator.h b/libc/src/__support/CPP/iterator.h
index b0fd5c9f22ae0..37d631b01582e 100644
--- a/libc/src/__support/CPP/iterator.h
+++ b/libc/src/__support/CPP/iterator.h
@@ -92,6 +92,74 @@ template <typename Iter> class reverse_iterator {
   }
 };
 
+template <typename Iter> class iterator {
+  Iter current;
+
+public:
+  using reference = typename iterator_traits<Iter>::reference;
+  using value_type = typename iterator_traits<Iter>::value_type;
+  using iterator_type = Iter;
+
+  LIBC_INLINE iterator() : current() {}
+  LIBC_INLINE constexpr explicit iterator(Iter it) : current(it) {}
+
+  template <typename Other,
+            cpp::enable_if_t<!cpp::is_same_v<Iter, Other> &&
+                                 cpp::is_convertible_v<const Other &, Iter>,
+                             int> = 0>
+  LIBC_INLINE constexpr explicit iterator(const Other &it) : current(it) {}
+
+  LIBC_INLINE friend constexpr bool operator==(const iterator &lhs,
+                                               const iterator &rhs) {
+    return lhs.base() == rhs.base();
+  }
+
+  LIBC_INLINE friend constexpr bool operator!=(const iterator &lhs,
+                                               const iterator &rhs) {
+    return lhs.base() != rhs.base();
+  }
+
+  LIBC_INLINE friend constexpr bool operator<(const iterator &lhs,
+                                              const iterator &rhs) {
+    return lhs.base() < rhs.base();
+  }
+
+  LIBC_INLINE friend constexpr bool operator<=(const iterator &lhs,
+                                               const iterator &rhs) {
+    return lhs.base() <= rhs.base();
+  }
+
+  LIBC_INLINE friend constexpr bool operator>(const iterator &lhs,
+                                              const iterator &rhs) {
+    return lhs.base() > rhs.base();
+  }
+
+  LIBC_INLINE friend constexpr bool operator>=(const iterator &lhs,
+                                               const iterator &rhs) {
+    return lhs.base() >= rhs.base();
+  }
+
+  LIBC_INLINE constexpr iterator_type base() const { return current; }
+
+  LIBC_INLINE constexpr reference operator*() const {
+    Iter tmp = current;
+    return *tmp;
+  }
+  LIBC_INLINE constexpr iterator operator--() {
+    --current;
+    return *this;
+  }
+  LIBC_INLINE constexpr iterator &operator++() {
+    ++current;
+    return *this;
+  }
+  LIBC_INLINE constexpr iterator operator++(int) {
+    iterator tmp(*this);
+    ++current;
+    return tmp;
+  }
+};
+
 } // namespace cpp
 } // namespace LIBC_NAMESPACE
 
diff --git a/libc/src/__support/fixedvector.h b/libc/src/__support/fixedvector.h
index 403b1620d20df..ef00fede07366 100644
--- a/libc/src/__support/fixedvector.h
+++ b/libc/src/__support/fixedvector.h
@@ -82,6 +82,10 @@ template <typename T, size_t CAPACITY> class FixedVector {
   // can easily swap one data structure for the other.
   static void destroy(FixedVector<T, CAPACITY> *store) { store->reset(); }
 
+  using iterator = typename cpp::array<T, CAPACITY>::iterator;
+  LIBC_INLINE constexpr iterator begin() { return iterator{&store[0]}; }
+  LIBC_INLINE constexpr iterator end() { return iterator{&store[item_count]}; }
+
   using reverse_iterator = typename cpp::array<T, CAPACITY>::reverse_iterator;
   LIBC_INLINE constexpr reverse_iterator rbegin() {
     return reverse_iterator{&store[item_count]};

>From 9f23d216e4b98aedf4f6bc2a56b70f05edb9b6a4 Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Wed, 29 May 2024 22:40:02 -0400
Subject: [PATCH 13/18] rename logger

---
 libc/benchmarks/gpu/BenchmarkLogger.cpp |  2 +-
 libc/benchmarks/gpu/BenchmarkLogger.h   |  2 +-
 libc/benchmarks/gpu/LibcGpuBenchmark.h  | 12 ++++++------
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/libc/benchmarks/gpu/BenchmarkLogger.cpp b/libc/benchmarks/gpu/BenchmarkLogger.cpp
index 9a36ee5b3046c..2e7e8e7600fdb 100644
--- a/libc/benchmarks/gpu/BenchmarkLogger.cpp
+++ b/libc/benchmarks/gpu/BenchmarkLogger.cpp
@@ -91,7 +91,7 @@ template BenchmarkLogger &BenchmarkLogger::operator<< <UInt<320>>(UInt<320>);
 
 // TODO: Add floating point formatting once it's supported by StringStream.
 
-BenchmarkLogger blog;
+BenchmarkLogger log;
 
 } // namespace benchmarks
 } // namespace LIBC_NAMESPACE
diff --git a/libc/benchmarks/gpu/BenchmarkLogger.h b/libc/benchmarks/gpu/BenchmarkLogger.h
index 98813b28eaa91..332ff1439e6f5 100644
--- a/libc/benchmarks/gpu/BenchmarkLogger.h
+++ b/libc/benchmarks/gpu/BenchmarkLogger.h
@@ -19,7 +19,7 @@ struct BenchmarkLogger {
 };
 
 // A global TestLogger instance to be used in tests.
-extern BenchmarkLogger blog;
+extern BenchmarkLogger log;
 
 } // namespace benchmarks
 } // namespace LIBC_NAMESPACE
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index 08e99dadc8d07..2a6fcd5ea2556 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -98,12 +98,12 @@ class Benchmark {
     auto result = benchmark(options, func);
     constexpr auto GREEN = "\033[32m";
     constexpr auto RESET = "\033[0m";
-    blog << GREEN << "[ RUN      ] " << RESET << name << '\n';
-    blog << GREEN << "[       OK ] " << RESET << name << ": " << result.cycles
-         << " cycles, " << result.min << " min, " << result.max << " max, "
-         << result.total_iterations << " iterations, " << result.total_time
-         << " ns, " << static_cast<long>(result.standard_deviation)
-         << " stddev\n";
+    log << GREEN << "[ RUN      ] " << RESET << name << '\n';
+    log << GREEN << "[       OK ] " << RESET << name << ": " << result.cycles
+        << " cycles, " << result.min << " min, " << result.max << " max, "
+        << result.total_iterations << " iterations, " << result.total_time
+        << " ns, " << static_cast<long>(result.standard_deviation)
+        << " stddev\n";
   }
   const cpp::string_view get_name() const { return name; }
 };

>From 46b5e25304e48e896297790ce17c2ac93db5a4b2 Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Wed, 29 May 2024 23:15:48 -0400
Subject: [PATCH 14/18] Revert "add forward iterator"

This reverts commit a5ebf57f198cd79be132854b036f904c3983341d.
---
 libc/benchmarks/gpu/LibcGpuBenchmark.cpp |  4 +-
 libc/src/__support/CPP/array.h           | 26 ++++-----
 libc/src/__support/CPP/iterator.h        | 68 ------------------------
 libc/src/__support/fixedvector.h         |  4 --
 4 files changed, 12 insertions(+), 90 deletions(-)

diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index a7a02cacc3305..e4f839e361dd0 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -13,8 +13,8 @@ void Benchmark::add_benchmark(Benchmark *benchmark) {
 }
 
 void Benchmark::run_benchmarks() {
-  for (Benchmark *benchmark : benchmarks)
-    benchmark->run();
+  for (auto it = benchmarks.rbegin(), e = benchmarks.rend(); it != e; ++it)
+    (*it)->run();
 }
 
 BenchmarkResult benchmark(const BenchmarkOptions &options,
diff --git a/libc/src/__support/CPP/array.h b/libc/src/__support/CPP/array.h
index 7e4cf29847daf..4e69ba003e800 100644
--- a/libc/src/__support/CPP/array.h
+++ b/libc/src/__support/CPP/array.h
@@ -22,12 +22,10 @@ template <class T, size_t N> struct array {
 
   T Data[N];
   using value_type = T;
-  using pointer_type = T *;
-  using iterator = cpp::iterator<pointer_type>;
-  using const_pointer_type = const T *;
-  using const_iterator = cpp::iterator<const_pointer_type>;
-  using reverse_iterator = cpp::reverse_iterator<pointer_type>;
-  using const_reverse_iterator = cpp::reverse_iterator<const_pointer_type>;
+  using iterator = T *;
+  using const_iterator = const T *;
+  using reverse_iterator = cpp::reverse_iterator<iterator>;
+  using const_reverse_iterator = cpp::reverse_iterator<const_iterator>;
 
   LIBC_INLINE constexpr T *data() { return Data; }
   LIBC_INLINE constexpr const T *data() const { return Data; }
@@ -48,16 +46,12 @@ template <class T, size_t N> struct array {
 
   LIBC_INLINE constexpr bool empty() const { return N == 0; }
 
-  LIBC_INLINE constexpr iterator begin() { return iterator{Data}; }
-  LIBC_INLINE constexpr const_iterator begin() const {
-    return const_iterator{Data};
-  }
+  LIBC_INLINE constexpr iterator begin() { return Data; }
+  LIBC_INLINE constexpr const_iterator begin() const { return Data; }
   LIBC_INLINE constexpr const_iterator cbegin() const { return begin(); }
 
-  LIBC_INLINE constexpr iterator end() { return iterator{Data + N}; }
-  LIBC_INLINE constexpr const_iterator end() const {
-    return const_iterator{Data + N};
-  }
+  LIBC_INLINE constexpr iterator end() { return Data + N; }
+  LIBC_INLINE constexpr const_iterator end() const { return Data + N; }
   LIBC_INLINE constexpr const_iterator cend() const { return end(); }
 
   LIBC_INLINE constexpr reverse_iterator rbegin() {
@@ -71,10 +65,10 @@ template <class T, size_t N> struct array {
   }
 
   LIBC_INLINE constexpr reverse_iterator rend() {
-    return reverse_iterator{Data};
+    return reverse_iterator{begin()};
   }
   LIBC_INLINE constexpr const_reverse_iterator rend() const {
-    return const_reverse_iterator{Data};
+    return const_reverse_iterator{begin()};
   }
   LIBC_INLINE constexpr const_reverse_iterator crend() const { return rend(); }
 };
diff --git a/libc/src/__support/CPP/iterator.h b/libc/src/__support/CPP/iterator.h
index 37d631b01582e..b0fd5c9f22ae0 100644
--- a/libc/src/__support/CPP/iterator.h
+++ b/libc/src/__support/CPP/iterator.h
@@ -92,74 +92,6 @@ template <typename Iter> class reverse_iterator {
   }
 };
 
-template <typename Iter> class iterator {
-  Iter current;
-
-public:
-  using reference = typename iterator_traits<Iter>::reference;
-  using value_type = typename iterator_traits<Iter>::value_type;
-  using iterator_type = Iter;
-
-  LIBC_INLINE iterator() : current() {}
-  LIBC_INLINE constexpr explicit iterator(Iter it) : current(it) {}
-
-  template <typename Other,
-            cpp::enable_if_t<!cpp::is_same_v<Iter, Other> &&
-                                 cpp::is_convertible_v<const Other &, Iter>,
-                             int> = 0>
-  LIBC_INLINE constexpr explicit iterator(const Other &it) : current(it) {}
-
-  LIBC_INLINE friend constexpr bool operator==(const iterator &lhs,
-                                               const iterator &rhs) {
-    return lhs.base() == rhs.base();
-  }
-
-  LIBC_INLINE friend constexpr bool operator!=(const iterator &lhs,
-                                               const iterator &rhs) {
-    return lhs.base() != rhs.base();
-  }
-
-  LIBC_INLINE friend constexpr bool operator<(const iterator &lhs,
-                                              const iterator &rhs) {
-    return lhs.base() < rhs.base();
-  }
-
-  LIBC_INLINE friend constexpr bool operator<=(const iterator &lhs,
-                                               const iterator &rhs) {
-    return lhs.base() <= rhs.base();
-  }
-
-  LIBC_INLINE friend constexpr bool operator>(const iterator &lhs,
-                                              const iterator &rhs) {
-    return lhs.base() > rhs.base();
-  }
-
-  LIBC_INLINE friend constexpr bool operator>=(const iterator &lhs,
-                                               const iterator &rhs) {
-    return lhs.base() >= rhs.base();
-  }
-
-  LIBC_INLINE constexpr iterator_type base() const { return current; }
-
-  LIBC_INLINE constexpr reference operator*() const {
-    Iter tmp = current;
-    return *tmp;
-  }
-  LIBC_INLINE constexpr iterator operator--() {
-    --current;
-    return *this;
-  }
-  LIBC_INLINE constexpr iterator &operator++() {
-    ++current;
-    return *this;
-  }
-  LIBC_INLINE constexpr iterator operator++(int) {
-    iterator tmp(*this);
-    ++current;
-    return tmp;
-  }
-};
-
 } // namespace cpp
 } // namespace LIBC_NAMESPACE
 
diff --git a/libc/src/__support/fixedvector.h b/libc/src/__support/fixedvector.h
index ef00fede07366..403b1620d20df 100644
--- a/libc/src/__support/fixedvector.h
+++ b/libc/src/__support/fixedvector.h
@@ -82,10 +82,6 @@ template <typename T, size_t CAPACITY> class FixedVector {
   // can easily swap one data structure for the other.
   static void destroy(FixedVector<T, CAPACITY> *store) { store->reset(); }
 
-  using iterator = typename cpp::array<T, CAPACITY>::iterator;
-  LIBC_INLINE constexpr iterator begin() { return iterator{&store[0]}; }
-  LIBC_INLINE constexpr iterator end() { return iterator{&store[item_count]}; }
-
   using reverse_iterator = typename cpp::array<T, CAPACITY>::reverse_iterator;
   LIBC_INLINE constexpr reverse_iterator rbegin() {
     return reverse_iterator{&store[item_count]};

>From 945090f8cc726be9560411d219421f2d7e5da775 Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Sat, 15 Jun 2024 12:15:31 -0400
Subject: [PATCH 15/18] support multithreaded benchmarks

---
 libc/benchmarks/gpu/CMakeLists.txt         |  1 +
 libc/benchmarks/gpu/LibcGpuBenchmark.cpp   | 60 +++++++++++++++++++++-
 libc/benchmarks/gpu/LibcGpuBenchmark.h     | 13 +----
 libc/benchmarks/gpu/timing/nvptx/timing.h  |  6 ---
 libc/cmake/modules/LLVMLibCTestRules.cmake |  8 ++-
 5 files changed, 67 insertions(+), 21 deletions(-)

diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
index 9ed45eedc402e..2814434ccd26c 100644
--- a/libc/benchmarks/gpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/CMakeLists.txt
@@ -15,6 +15,7 @@ function(add_benchmark benchmark_name)
   endif()
   add_libc_hermetic(
     ${benchmark_name}
+    IS_BENCHMARK
     LINK_LIBRARIES
       LibcGpuBenchmark.hermetic
       ${BENCHMARK_LINK_LIBRARIES}
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index e4f839e361dd0..0776ebf950ddf 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -1,20 +1,76 @@
 #include "LibcGpuBenchmark.h"
 #include "src/__support/CPP/algorithm.h"
+#include "src/__support/CPP/array.h"
+#include "src/__support/CPP/string.h"
 #include "src/__support/FPUtil/sqrt.h"
+#include "src/__support/GPU/utils.h"
+#include "src/__support/fixedvector.h"
 #include "src/time/gpu/time_utils.h"
 
 namespace LIBC_NAMESPACE {
 namespace benchmarks {
 
 FixedVector<Benchmark *, 64> benchmarks;
+cpp::array<BenchmarkResult, 1024> results;
 
 void Benchmark::add_benchmark(Benchmark *benchmark) {
   benchmarks.push_back(benchmark);
 }
 
+BenchmarkResult reduce_results(cpp::array<BenchmarkResult, 1024> &results) {
+  BenchmarkResult result;
+  uint64_t cycles_sum = 0;
+  double standard_deviation_sum = 0;
+  uint64_t min = UINT64_MAX;
+  uint64_t max = 0;
+  uint32_t samples_sum = 0;
+  uint32_t iterations_sum = 0;
+  clock_t time_sum = 0;
+  uint64_t num_threads = gpu::get_num_threads();
+  for (uint64_t i = 0; i < num_threads; i++) {
+    BenchmarkResult current_result = results[i];
+    cycles_sum += current_result.cycles;
+    standard_deviation_sum += current_result.standard_deviation;
+    min = cpp::min(min, current_result.min);
+    max = cpp::max(max, current_result.max);
+    samples_sum += current_result.samples;
+    iterations_sum += current_result.total_iterations;
+    time_sum += current_result.total_time;
+  }
+  result.cycles = cycles_sum / num_threads;
+  result.standard_deviation = standard_deviation_sum / num_threads;
+  result.min = min;
+  result.max = max;
+  result.samples = samples_sum / num_threads;
+  result.total_iterations = iterations_sum / num_threads;
+  result.total_time = time_sum / num_threads;
+  return result;
+}
+
 void Benchmark::run_benchmarks() {
-  for (auto it = benchmarks.rbegin(), e = benchmarks.rend(); it != e; ++it)
-    (*it)->run();
+  uint64_t id = gpu::get_thread_id();
+  gpu::sync_threads();
+
+  for (auto it = benchmarks.rbegin(), e = benchmarks.rend(); it != e; ++it) {
+    Benchmark *benchmark = *it;
+    results[id] = benchmark->run();
+  }
+  gpu::sync_threads();
+  if (id == 0) {
+    for (auto it = benchmarks.rbegin(), e = benchmarks.rend(); it != e; ++it) {
+      Benchmark *benchmark = *it;
+      BenchmarkResult all_results = reduce_results(results);
+      constexpr auto GREEN = "\033[32m";
+      constexpr auto RESET = "\033[0m";
+      log << GREEN << "[ RUN      ] " << RESET << benchmark->get_name() << '\n';
+      log << GREEN << "[       OK ] " << RESET << benchmark->get_name() << ": "
+          << all_results.cycles << " cycles, " << all_results.min << " min, "
+          << all_results.max << " max, " << all_results.total_iterations
+          << " iterations, " << all_results.total_time << " ns, "
+          << static_cast<long>(all_results.standard_deviation) << " stddev\n";
+    }
+  }
+  gpu::sync_threads();
 }
 
 BenchmarkResult benchmark(const BenchmarkOptions &options,
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index 2a6fcd5ea2556..59dd589462080 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -6,7 +6,6 @@
 #include "src/__support/CPP/functional.h"
 #include "src/__support/CPP/limits.h"
 #include "src/__support/CPP/string_view.h"
-#include "src/__support/fixedvector.h"
 #include "src/time/clock.h"
 
 #include <stdint.h>
@@ -93,17 +92,9 @@ class Benchmark {
   static void add_benchmark(Benchmark *benchmark);
 
 private:
-  void run() {
+  BenchmarkResult run() {
     BenchmarkOptions options;
-    auto result = benchmark(options, func);
-    constexpr auto GREEN = "\033[32m";
-    constexpr auto RESET = "\033[0m";
-    log << GREEN << "[ RUN      ] " << RESET << name << '\n';
-    log << GREEN << "[       OK ] " << RESET << name << ": " << result.cycles
-        << " cycles, " << result.min << " min, " << result.max << " max, "
-        << result.total_iterations << " iterations, " << result.total_time
-        << " ns, " << static_cast<long>(result.standard_deviation)
-        << " stddev\n";
+    return benchmark(options, func);
   }
   const cpp::string_view get_name() const { return name; }
 };
diff --git a/libc/benchmarks/gpu/timing/nvptx/timing.h b/libc/benchmarks/gpu/timing/nvptx/timing.h
index 001bdd3686062..5c45425706f11 100644
--- a/libc/benchmarks/gpu/timing/nvptx/timing.h
+++ b/libc/benchmarks/gpu/timing/nvptx/timing.h
@@ -24,13 +24,11 @@ namespace LIBC_NAMESPACE {
 [[gnu::noinline]] static uint64_t overhead() {
   volatile uint32_t x = 1;
   uint32_t y = x;
-  gpu::sync_threads();
   uint64_t start = gpu::processor_clock();
   asm volatile("" ::"r"(y), "llr"(start));
   uint32_t result = y;
   asm volatile("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :);
   uint64_t stop = gpu::processor_clock();
-  gpu::sync_threads();
   volatile auto storage = result;
   return stop - start;
 }
@@ -47,7 +45,6 @@ template <typename F, typename T>
   asm volatile("" ::"r"(arg));
 
   // Get the current timestamp from the clock.
-  gpu::sync_threads();
   gpu::memory_fence();
   uint64_t start = gpu::processor_clock();
 
@@ -66,7 +63,6 @@ template <typename F, typename T>
   // ordering.
   uint64_t stop = gpu::processor_clock();
   gpu::memory_fence();
-  gpu::sync_threads();
   asm volatile("" ::"r"(stop));
   volatile T output = result;
 
@@ -82,7 +78,6 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {
   T2 arg2 = storage2;
   asm volatile("" ::"r"(arg), "r"(arg2));
 
-  gpu::sync_threads();
   gpu::memory_fence();
   uint64_t start = gpu::processor_clock();
 
@@ -94,7 +89,6 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {
 
   uint64_t stop = gpu::processor_clock();
   gpu::memory_fence();
-  gpu::sync_threads();
   asm volatile("" ::"r"(stop));
   volatile auto output = result;
 
diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake
index 508694ae9fc01..fbeec32883b63 100644
--- a/libc/cmake/modules/LLVMLibCTestRules.cmake
+++ b/libc/cmake/modules/LLVMLibCTestRules.cmake
@@ -553,7 +553,7 @@ function(add_libc_hermetic test_name)
   endif()
   cmake_parse_arguments(
     "HERMETIC_TEST"
-    "" # No optional arguments
+    "IS_BENCHMARK" # Optional arguments
     "SUITE" # Single value arguments
     "SRCS;HDRS;DEPENDS;ARGS;ENV;COMPILE_OPTIONS;LINK_LIBRARIES;LOADER_ARGS" # Multi-value arguments
     ${ARGN}
@@ -716,7 +716,11 @@ function(add_libc_hermetic test_name)
   )
 
   add_dependencies(${HERMETIC_TEST_SUITE} ${fq_target_name})
-  add_dependencies(libc-hermetic-tests ${fq_target_name})
+  if(NOT ${HERMETIC_TEST_IS_BENCHMARK})
+    # If it is a benchmark, it will already have been added to the
+    # gpu-benchmark target
+    add_dependencies(libc-hermetic-tests ${fq_target_name})
+  endif()
 endfunction(add_libc_hermetic)
 
 # A convenience function to add both a unit test as well as a hermetic test.

>From 4aa5e8bc05d814e67a332f578ed2893230e90dd7 Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Sat, 15 Jun 2024 14:26:17 -0400
Subject: [PATCH 16/18] use for each syntax

---
 libc/benchmarks/gpu/CMakeLists.txt       |  2 +-
 libc/benchmarks/gpu/LibcGpuBenchmark.cpp | 13 +++++--------
 2 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
index 2814434ccd26c..4d2a3a4ac66d3 100644
--- a/libc/benchmarks/gpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/CMakeLists.txt
@@ -47,7 +47,7 @@ add_unittest_framework_library(
     libc.src.__support.macros.properties.types
     libc.src.__support.OSUtil.osutil
     libc.src.__support.uint128
-    libc.src.___support.FPUtil.sqrt
+    libc.src.__support.FPUtil.sqrt
     libc.src.__support.fixedvector
     libc.src.time.clock
     libc.benchmarks.gpu.timing.timing
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index 0776ebf950ddf..69adb0c95ba76 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -51,14 +51,11 @@ void Benchmark::run_benchmarks() {
   uint64_t id = gpu::get_thread_id();
   gpu::sync_threads();
 
-  for (auto it = benchmarks.rbegin(), e = benchmarks.rend(); it != e; ++it) {
-    Benchmark *benchmark = *it;
+  for (Benchmark *benchmark : benchmarks)
     results[id] = benchmark->run();
-  }
   gpu::sync_threads();
   if (id == 0) {
-    for (auto it = benchmarks.rbegin(), e = benchmarks.rend(); it != e; ++it) {
-      Benchmark *benchmark = *it;
+    for (Benchmark *benchmark : benchmarks) {
       BenchmarkResult all_results = reduce_results(results);
       constexpr auto GREEN = "\033[32m";
       constexpr auto RESET = "\033[0m";
@@ -128,9 +125,9 @@ BenchmarkResult benchmark(const BenchmarkOptions &options,
     iterations *= options.scaling_factor;
   }
   result.cycles = best_guess;
-  result.standard_deviation =
-      fputil::sqrt(static_cast<double>(cycles_squared) / total_iterations -
-                   (best_guess * best_guess));
+  result.standard_deviation = fputil::sqrt<double>(
+      static_cast<double>(cycles_squared) / total_iterations -
+      static_cast<double>(best_guess * best_guess));
   result.min = min;
   result.max = max;
   result.samples = samples;

>From b93318e2aaec5fc4ed6da0e047e59e88c3251894 Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Sat, 15 Jun 2024 17:53:22 -0400
Subject: [PATCH 17/18] switch LINK_LIBRARIES argument from optional to
 multi-value to fix build issue

---
 libc/benchmarks/gpu/CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
index 4d2a3a4ac66d3..d167abcaf2db1 100644
--- a/libc/benchmarks/gpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/CMakeLists.txt
@@ -5,9 +5,9 @@ add_custom_target(gpu-benchmark)
 function(add_benchmark benchmark_name)
   cmake_parse_arguments(
     "BENCHMARK"
-    "LINK_LIBRARIES" # Optional arguments
+    "" # Optional arguments
     "" # Single value arguments
-    "" # Multi-value arguments
+    "LINK_LIBRARIES" # Multi-value arguments
     ${ARGN}
   )
   if(NOT libc.src.time.clock IN_LIST TARGET_LLVMLIBC_ENTRYPOINTS)

>From cb3b05c5d6f9d25d5dbdabc70053039f00165511 Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Thu, 20 Jun 2024 22:48:11 -0400
Subject: [PATCH 18/18] clean up

---
 libc/benchmarks/gpu/src/CMakeLists.txt      |  1 -
 libc/benchmarks/gpu/src/math/CMakeLists.txt |  0
 libc/benchmarks/gpu/timing/nvptx/timing.h   | 20 ++++++++++----------
 3 files changed, 10 insertions(+), 11 deletions(-)
 delete mode 100644 libc/benchmarks/gpu/src/math/CMakeLists.txt

diff --git a/libc/benchmarks/gpu/src/CMakeLists.txt b/libc/benchmarks/gpu/src/CMakeLists.txt
index f15d082e4dd2b..42eb4f7b5909a 100644
--- a/libc/benchmarks/gpu/src/CMakeLists.txt
+++ b/libc/benchmarks/gpu/src/CMakeLists.txt
@@ -1,2 +1 @@
 add_subdirectory(ctype)
-add_subdirectory(math)
diff --git a/libc/benchmarks/gpu/src/math/CMakeLists.txt b/libc/benchmarks/gpu/src/math/CMakeLists.txt
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/libc/benchmarks/gpu/timing/nvptx/timing.h b/libc/benchmarks/gpu/timing/nvptx/timing.h
index 5c45425706f11..d3851a764c43d 100644
--- a/libc/benchmarks/gpu/timing/nvptx/timing.h
+++ b/libc/benchmarks/gpu/timing/nvptx/timing.h
@@ -25,9 +25,9 @@ namespace LIBC_NAMESPACE {
   volatile uint32_t x = 1;
   uint32_t y = x;
   uint64_t start = gpu::processor_clock();
-  asm volatile("" ::"r"(y), "llr"(start));
+  asm("" ::"r"(y), "llr"(start));
   uint32_t result = y;
-  asm volatile("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :);
+  asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :);
   uint64_t stop = gpu::processor_clock();
   volatile auto storage = result;
   return stop - start;
@@ -42,7 +42,7 @@ template <typename F, typename T>
   // not constant propagate it and remove the profiling region.
   volatile T storage = t;
   T arg = storage;
-  asm volatile("" ::"r"(arg));
+  asm("" ::"r"(arg));
 
   // Get the current timestamp from the clock.
   gpu::memory_fence();
@@ -50,20 +50,20 @@ template <typename F, typename T>
 
   // This forces the compiler to load the input argument and run the clock cycle
   // counter before the profiling region.
-  asm volatile("" ::"r"(arg), "llr"(start));
+  asm("" ::"r"(arg), "llr"(start));
 
   // Run the function under test and return its value.
   auto result = f(arg);
 
   // This inline assembly performs a no-op which forces the result to both be
   // used and prevents us from exiting this region before it's complete.
-  asm volatile("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :);
+  asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :);
 
   // Obtain the current timestamp after running the calculation and force
   // ordering.
   uint64_t stop = gpu::processor_clock();
   gpu::memory_fence();
-  asm volatile("" ::"r"(stop));
+  asm("" ::"r"(stop));
   volatile T output = result;
 
   // Return the time elapsed.
@@ -76,20 +76,20 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {
   volatile T2 storage2 = t2;
   T1 arg = storage;
   T2 arg2 = storage2;
-  asm volatile("" ::"r"(arg), "r"(arg2));
+  asm("" ::"r"(arg), "r"(arg2));
 
   gpu::memory_fence();
   uint64_t start = gpu::processor_clock();
 
-  asm volatile("" ::"r"(arg), "r"(arg2), "llr"(start));
+  asm("" ::"r"(arg), "r"(arg2), "llr"(start));
 
   auto result = f(arg, arg2);
 
-  asm volatile("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :);
+  asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :);
 
   uint64_t stop = gpu::processor_clock();
   gpu::memory_fence();
-  asm volatile("" ::"r"(stop));
+  asm("" ::"r"(stop));
   volatile auto output = result;
 
   return stop - start;



More information about the libc-commits mailing list