[libc-commits] [libc] [libc] NVPTX Profiling Draft (PR #92009)

Mon May 13 11:46:48 PDT 2024

https://github.com/jameshu15869 created https://github.com/llvm/llvm-project/pull/92009

Draft PR for adding microbenchmarking infrastructure for NVPTX. `nvlink` cannot perform LTO, so `libc` functions cannot be inlined, and the resulting function-call overhead is not yet accounted for in the microbenchmark results.

>From 929819c8619d98c49ff6b7295faad7d83f1a956f Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Sat, 11 May 2024 17:19:54 -0400
Subject: [PATCH 1/3] initial nvptx microbenchmarking infrastructure

---
 libc/CMakeLists.txt                           |   2 +
 libc/benchmarks/CMakeLists.txt                | 412 +++++++++---------
 libc/benchmarks/gpu/CMakeLists.txt            | 203 +++++++++
 libc/benchmarks/gpu/LibcGpuBenchmark.cpp      |  75 ++++
 libc/benchmarks/gpu/LibcGpuBenchmark.h        | 282 ++++++++++++
 libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp  |   6 +
 libc/benchmarks/gpu/TestLogger.cpp            |  89 ++++
 libc/benchmarks/gpu/TestLogger.h              |  27 ++
 libc/benchmarks/gpu/dummy.cpp                 |  42 ++
 libc/benchmarks/gpu/src/CMakeLists.txt        |   2 +
 libc/benchmarks/gpu/src/ctype/CMakeLists.txt  |  21 +
 .../gpu/src/ctype/isalnum_benchmark.cpp       |  24 +
 .../gpu/src/ctype/isalpha_benchmark.cpp       |   9 +
 libc/benchmarks/gpu/src/math/CMakeLists.txt   |   0
 libc/benchmarks/gpu/timing/CMakeLists.txt     |  20 +
 .../gpu/timing/nvptx/CMakeLists.txt           |   8 +
 libc/benchmarks/gpu/timing/nvptx/timing.h     | 175 ++++++++
 libc/benchmarks/gpu/timing/timing.h           |  22 +
 18 files changed, 1216 insertions(+), 203 deletions(-)
 create mode 100644 libc/benchmarks/gpu/CMakeLists.txt
 create mode 100644 libc/benchmarks/gpu/LibcGpuBenchmark.cpp
 create mode 100644 libc/benchmarks/gpu/LibcGpuBenchmark.h
 create mode 100644 libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp
 create mode 100644 libc/benchmarks/gpu/TestLogger.cpp
 create mode 100644 libc/benchmarks/gpu/TestLogger.h
 create mode 100644 libc/benchmarks/gpu/dummy.cpp
 create mode 100644 libc/benchmarks/gpu/src/CMakeLists.txt
 create mode 100644 libc/benchmarks/gpu/src/ctype/CMakeLists.txt
 create mode 100644 libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp
 create mode 100644 libc/benchmarks/gpu/src/ctype/isalpha_benchmark.cpp
 create mode 100644 libc/benchmarks/gpu/src/math/CMakeLists.txt
 create mode 100644 libc/benchmarks/gpu/timing/CMakeLists.txt
 create mode 100644 libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt
 create mode 100644 libc/benchmarks/gpu/timing/nvptx/timing.h
 create mode 100644 libc/benchmarks/gpu/timing/timing.h

diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt
index 175efd89d67e6..ba6eadd207a4f 100644
--- a/libc/CMakeLists.txt
+++ b/libc/CMakeLists.txt
@@ -400,7 +400,9 @@ if(LLVM_INCLUDE_TESTS)
   add_subdirectory(fuzzing)
 endif()
 
+message(STATUS "Checking on variable: ${LIBC_INCLUDE_BENCHMARKS}")
 if(LIBC_INCLUDE_BENCHMARKS)
+	message(STATUS "including libc benchmarks")
   add_subdirectory(benchmarks)
 endif()
 
diff --git a/libc/benchmarks/CMakeLists.txt b/libc/benchmarks/CMakeLists.txt
index 4978da65850cc..1fb7026d79359 100644
--- a/libc/benchmarks/CMakeLists.txt
+++ b/libc/benchmarks/CMakeLists.txt
@@ -1,205 +1,211 @@
-find_package(Threads)
-
-set(LLVM_LINK_COMPONENTS
-  Support
-  TargetParser
-  )
-
-#==============================================================================
-# Add Unit Testing Support
-#==============================================================================
-
-function(add_libc_benchmark_unittest target_name)
-  if(NOT LLVM_INCLUDE_TESTS)
-    return()
-  endif()
-
-  cmake_parse_arguments(
-    "LIBC_BENCHMARKS_UNITTEST"
-    "" # No optional arguments
-    "SUITE" # Single value arguments
-    "SRCS;DEPENDS" # Multi-value arguments
-    ${ARGN}
-  )
-
-  add_executable(${target_name}
-    EXCLUDE_FROM_ALL
-    ${LIBC_BENCHMARKS_UNITTEST_SRCS}
-  )
-  target_link_libraries(${target_name}
-    PRIVATE
-    llvm_gtest_main
-    llvm_gtest
-    ${LIBC_BENCHMARKS_UNITTEST_DEPENDS}
-  )
-  llvm_update_compile_flags(${target_name})
-
-  add_custom_command(
-    TARGET ${target_name}
-    POST_BUILD
-    COMMAND $<TARGET_FILE:${target_name}>
-  )
-  add_dependencies(libc-benchmark-util-tests ${target_name})
-endfunction()
-
-#==============================================================================
-# Build Google Benchmark for libc
-#==============================================================================
-
-include(ExternalProject)
-ExternalProject_Add(google-benchmark-libc
-        EXCLUDE_FROM_ALL ON
-        PREFIX google-benchmark-libc
-        SOURCE_DIR ${LLVM_THIRD_PARTY_DIR}/benchmark
-        INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/google-benchmark-libc
-        CMAKE_CACHE_ARGS
-          -DBENCHMARK_ENABLE_EXCEPTIONS:BOOL=OFF
-          -DBENCHMARK_ENABLE_LTO:BOOL=OFF
-          -DBENCHMARK_ENABLE_TESTING:BOOL=OFF
-          -DBENCHMARK_ENABLE_WERROR:BOOL=${LLVM_ENABLE_WERROR}
-          -DBENCHMARK_FORCE_WERROR:BOOL=OFF
-          -DBENCHMARK_USE_LIBCXX:BOOL=OFF
-          -DCMAKE_BUILD_TYPE:STRING=Release
-
-          -DCMAKE_SYSTEM_NAME:STRING=${CMAKE_SYSTEM_NAME}
-          -DCMAKE_SYSTEM_PROCESSOR:STRING=${CMAKE_SYSTEM_PROCESSOR}
-          -DCMAKE_C_COMPILER:STRING=${CMAKE_C_COMPILER}
-          -DCMAKE_CXX_COMPILER:STRING=${CMAKE_CXX_COMPILER}
-          -DCMAKE_CXX_FLAGS:STRING=${CMAKE_CXX_FLAGS}
-          -DCMAKE_FIND_ROOT_PATH:STRING=${CMAKE_FIND_ROOT_PATH}
-
-          -DBUILD_SHARED_LIBS:BOOL=OFF
-          -DCMAKE_EXE_LINKER_FLAGS:STRING=-static
-
-          -DCMAKE_CXX_STANDARD:STRING=14
-          -DCMAKE_INSTALL_PREFIX:PATH=<INSTALL_DIR>
-        )
-
-add_custom_target(libc-benchmark-util-tests)
-
-# libc-benchmark
-add_library(libc-benchmark
-    STATIC
-    EXCLUDE_FROM_ALL
-    LibcBenchmark.cpp
-    LibcBenchmark.h
-)
-
-target_include_directories(libc-benchmark
-    PUBLIC ${LLVM_INCLUDE_DIR} ${LLVM_MAIN_INCLUDE_DIR}
-)
-target_link_libraries(libc-benchmark
-    PUBLIC
+if(NOT LIBC_TARGET_OS_IS_GPU)
+	find_package(Threads)
+
+	set(LLVM_LINK_COMPONENTS
+	  Support
+	  TargetParser
+	  )
+
+	#==============================================================================
+	# Add Unit Testing Support
+	#==============================================================================
+
+	function(add_libc_benchmark_unittest target_name)
+	  if(NOT LLVM_INCLUDE_TESTS)
+	    return()
+	  endif()
+
+	  cmake_parse_arguments(
+	    "LIBC_BENCHMARKS_UNITTEST"
+	    "" # No optional arguments
+	    "SUITE" # Single value arguments
+	    "SRCS;DEPENDS" # Multi-value arguments
+	    ${ARGN}
+	  )
+
+	  add_executable(${target_name}
+	    EXCLUDE_FROM_ALL
+	    ${LIBC_BENCHMARKS_UNITTEST_SRCS}
+	  )
+	  target_link_libraries(${target_name}
+	    PRIVATE
+	    llvm_gtest_main
+	    llvm_gtest
+	    ${LIBC_BENCHMARKS_UNITTEST_DEPENDS}
+	  )
+	  llvm_update_compile_flags(${target_name})
+
+	  add_custom_command(
+	    TARGET ${target_name}
+	    POST_BUILD
+	    COMMAND $<TARGET_FILE:${target_name}>
+	  )
+	  add_dependencies(libc-benchmark-util-tests ${target_name})
+	endfunction()
+
+	#==============================================================================
+	# Build Google Benchmark for libc
+	#==============================================================================
+
+	include(ExternalProject)
+	ExternalProject_Add(google-benchmark-libc
+		EXCLUDE_FROM_ALL ON
+		PREFIX google-benchmark-libc
+		SOURCE_DIR ${LLVM_THIRD_PARTY_DIR}/benchmark
+		INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/google-benchmark-libc
+		CMAKE_CACHE_ARGS
+		  -DBENCHMARK_ENABLE_EXCEPTIONS:BOOL=OFF
+		  -DBENCHMARK_ENABLE_LTO:BOOL=OFF
+		  -DBENCHMARK_ENABLE_TESTING:BOOL=OFF
+		  -DBENCHMARK_ENABLE_WERROR:BOOL=${LLVM_ENABLE_WERROR}
+		  -DBENCHMARK_FORCE_WERROR:BOOL=OFF
+		  -DBENCHMARK_USE_LIBCXX:BOOL=OFF
+		  -DCMAKE_BUILD_TYPE:STRING=Release
+
+		  -DCMAKE_SYSTEM_NAME:STRING=${CMAKE_SYSTEM_NAME}
+		  -DCMAKE_SYSTEM_PROCESSOR:STRING=${CMAKE_SYSTEM_PROCESSOR}
+		  -DCMAKE_C_COMPILER:STRING=${CMAKE_C_COMPILER}
+		  -DCMAKE_CXX_COMPILER:STRING=${CMAKE_CXX_COMPILER}
+		  -DCMAKE_CXX_FLAGS:STRING=${CMAKE_CXX_FLAGS}
+		  -DCMAKE_FIND_ROOT_PATH:STRING=${CMAKE_FIND_ROOT_PATH}
+
+		  -DBUILD_SHARED_LIBS:BOOL=OFF
+		  -DCMAKE_EXE_LINKER_FLAGS:STRING=-static
+
+		  -DCMAKE_CXX_STANDARD:STRING=14
+		  -DCMAKE_INSTALL_PREFIX:PATH=<INSTALL_DIR>
+		)
+
+	add_custom_target(libc-benchmark-util-tests)
+
+	# libc-benchmark
+	add_library(libc-benchmark
+	    STATIC
+	    EXCLUDE_FROM_ALL
+	    LibcBenchmark.cpp
+	    LibcBenchmark.h
+	)
+
+	target_include_directories(libc-benchmark
+	    PUBLIC ${LLVM_INCLUDE_DIR} ${LLVM_MAIN_INCLUDE_DIR}
+	)
+	target_link_libraries(libc-benchmark
+	    PUBLIC
     benchmark::benchmark
-    LLVMSupport
-    LLVMTargetParser
+	    LLVMSupport
+	    LLVMTargetParser
     Threads::Threads
-)
-add_dependencies(libc-benchmark google-benchmark-libc)
-llvm_update_compile_flags(libc-benchmark)
-
-add_libc_benchmark_unittest(libc-benchmark-test
-    SRCS LibcBenchmarkTest.cpp
-    DEPENDS libc-benchmark
-)
-
-# libc-memory-benchmark
-add_library(libc-memory-benchmark
-    STATIC
-    EXCLUDE_FROM_ALL
-    LibcMemoryBenchmark.cpp
-    LibcMemoryBenchmark.h
-    LibcFunctionPrototypes.h
-    MemorySizeDistributions.cpp
-    MemorySizeDistributions.h
-)
-target_include_directories(libc-memory-benchmark
-    PUBLIC
-    ${CMAKE_CURRENT_SOURCE_DIR}
-)
-target_link_libraries(libc-memory-benchmark
-    PUBLIC
-    libc-benchmark
-)
-llvm_update_compile_flags(libc-memory-benchmark)
-
-add_libc_benchmark_unittest(libc-memory-benchmark-test
-    SRCS LibcMemoryBenchmarkTest.cpp
-    DEPENDS libc-memory-benchmark
-)
-
-# json
-add_library(json
-    STATIC
-    EXCLUDE_FROM_ALL
-    JSON.cpp
-    JSON.h
-)
-target_link_libraries(json PUBLIC libc-memory-benchmark)
-llvm_update_compile_flags(json)
-
-add_libc_benchmark_unittest(json-test
-    SRCS JSONTest.cpp
-    DEPENDS json
-)
-
-#==============================================================================
-# Benchmarking tool
-#==============================================================================
-
-# Benchmark all implementations that can run on the target CPU.
-function(add_libc_multi_impl_benchmark name)
-  get_property(fq_implementations GLOBAL PROPERTY ${name}_implementations)
-  foreach(fq_config_name IN LISTS fq_implementations)
-    get_target_property(required_cpu_features ${fq_config_name} REQUIRE_CPU_FEATURES)
-    cpu_supports(can_run "${required_cpu_features}")
-    if(can_run)
-        set(benchmark_name ${fq_config_name}_benchmark)
-        add_executable(${benchmark_name}
-            EXCLUDE_FROM_ALL
-            LibcMemoryBenchmarkMain.cpp
-        )
-        get_target_property(entrypoint_object_file ${fq_config_name} "OBJECT_FILE_RAW")
-        target_link_libraries(${benchmark_name} PUBLIC json ${entrypoint_object_file})
-        string(TOUPPER ${name} name_upper)
-        target_compile_definitions(${benchmark_name} PRIVATE "-DLIBC_BENCHMARK_FUNCTION_${name_upper}=LIBC_NAMESPACE::${name}" "-DLIBC_BENCHMARK_FUNCTION_NAME=\"${fq_config_name}\"")
-        llvm_update_compile_flags(${benchmark_name})
-    else()
-      message(STATUS "Skipping benchmark for '${fq_config_name}' insufficient host cpu features '${required_cpu_features}'")
-    endif()
-  endforeach()
-endfunction()
-
-add_libc_multi_impl_benchmark(bcmp)
-add_libc_multi_impl_benchmark(bzero)
-add_libc_multi_impl_benchmark(memcmp)
-add_libc_multi_impl_benchmark(memcpy)
-add_libc_multi_impl_benchmark(memmove)
-add_libc_multi_impl_benchmark(memset)
-
-#==============================================================================
-# Google Benchmarking tool
-#==============================================================================
-
-# This target uses the Google Benchmark facility to report throughput for llvm
-# libc memory functions compiled for the host machine. This is useful to
-# continuously monitor the performance of the memory functions.
-add_executable(libc.benchmarks.memory_functions.opt_host
-  EXCLUDE_FROM_ALL
-  LibcMemoryGoogleBenchmarkMain.cpp
-  LibcDefaultImplementations.cpp
-)
-target_link_libraries(libc.benchmarks.memory_functions.opt_host
-  PRIVATE
-  libc-memory-benchmark
-  libc.src.string.memcmp_opt_host.__internal__
-  libc.src.string.bcmp_opt_host.__internal__
-  libc.src.string.memcpy_opt_host.__internal__
-  libc.src.string.memset_opt_host.__internal__
-  libc.src.string.bzero_opt_host.__internal__
-  libc.src.string.memmove_opt_host.__internal__
-  benchmark_main
-)
-llvm_update_compile_flags(libc.benchmarks.memory_functions.opt_host)
-
-add_subdirectory(automemcpy)
+	)
+	add_dependencies(libc-benchmark google-benchmark-libc)
+	llvm_update_compile_flags(libc-benchmark)
+
+	add_libc_benchmark_unittest(libc-benchmark-test
+	    SRCS LibcBenchmarkTest.cpp
+	    DEPENDS libc-benchmark
+	)
+
+	# libc-memory-benchmark
+	add_library(libc-memory-benchmark
+	    STATIC
+	    EXCLUDE_FROM_ALL
+	    LibcMemoryBenchmark.cpp
+	    LibcMemoryBenchmark.h
+	    LibcFunctionPrototypes.h
+	    MemorySizeDistributions.cpp
+	    MemorySizeDistributions.h
+	)
+	target_include_directories(libc-memory-benchmark
+	    PUBLIC
+	    ${CMAKE_CURRENT_SOURCE_DIR}
+	)
+	target_link_libraries(libc-memory-benchmark
+	    PUBLIC
+	    libc-benchmark
+	)
+	llvm_update_compile_flags(libc-memory-benchmark)
+
+	add_libc_benchmark_unittest(libc-memory-benchmark-test
+	    SRCS LibcMemoryBenchmarkTest.cpp
+	    DEPENDS libc-memory-benchmark
+	)
+
+	# json
+	add_library(json
+	    STATIC
+	    EXCLUDE_FROM_ALL
+	    JSON.cpp
+	    JSON.h
+	)
+	target_link_libraries(json PUBLIC libc-memory-benchmark)
+	llvm_update_compile_flags(json)
+
+	add_libc_benchmark_unittest(json-test
+	    SRCS JSONTest.cpp
+	    DEPENDS json
+	)
+
+	#==============================================================================
+	# Benchmarking tool
+	#==============================================================================
+
+	# Benchmark all implementations that can run on the target CPU.
+	function(add_libc_multi_impl_benchmark name)
+	  get_property(fq_implementations GLOBAL PROPERTY ${name}_implementations)
+	  foreach(fq_config_name IN LISTS fq_implementations)
+	    get_target_property(required_cpu_features ${fq_config_name} REQUIRE_CPU_FEATURES)
+	    cpu_supports(can_run "${required_cpu_features}")
+	    if(can_run)
+		set(benchmark_name ${fq_config_name}_benchmark)
+		add_executable(${benchmark_name}
+		    EXCLUDE_FROM_ALL
+		    LibcMemoryBenchmarkMain.cpp
+		)
+		get_target_property(entrypoint_object_file ${fq_config_name} "OBJECT_FILE_RAW")
+		target_link_libraries(${benchmark_name} PUBLIC json ${entrypoint_object_file})
+		string(TOUPPER ${name} name_upper)
+		target_compile_definitions(${benchmark_name} PRIVATE "-DLIBC_BENCHMARK_FUNCTION_${name_upper}=LIBC_NAMESPACE::${name}" "-DLIBC_BENCHMARK_FUNCTION_NAME=\"${fq_config_name}\"")
+		llvm_update_compile_flags(${benchmark_name})
+	    else()
+	      message(STATUS "Skipping benchmark for '${fq_config_name}' insufficient host cpu features '${required_cpu_features}'")
+	    endif()
+	  endforeach()
+	endfunction()
+
+	add_libc_multi_impl_benchmark(bcmp)
+	add_libc_multi_impl_benchmark(bzero)
+	add_libc_multi_impl_benchmark(memcmp)
+	add_libc_multi_impl_benchmark(memcpy)
+	add_libc_multi_impl_benchmark(memmove)
+	add_libc_multi_impl_benchmark(memset)
+
+	#==============================================================================
+	# Google Benchmarking tool
+	#==============================================================================
+
+	# This target uses the Google Benchmark facility to report throughput for llvm
+	# libc memory functions compiled for the host machine. This is useful to
+	# continuously monitor the performance of the memory functions.
+	add_executable(libc.benchmarks.memory_functions.opt_host
+	  EXCLUDE_FROM_ALL
+	  LibcMemoryGoogleBenchmarkMain.cpp
+	  LibcDefaultImplementations.cpp
+	)
+	target_link_libraries(libc.benchmarks.memory_functions.opt_host
+	  PRIVATE
+	  libc-memory-benchmark
+	  libc.src.string.memcmp_opt_host.__internal__
+	  libc.src.string.bcmp_opt_host.__internal__
+	  libc.src.string.memcpy_opt_host.__internal__
+	  libc.src.string.memset_opt_host.__internal__
+	  libc.src.string.bzero_opt_host.__internal__
+	  libc.src.string.memmove_opt_host.__internal__
+	  benchmark_main
+	)
+	llvm_update_compile_flags(libc.benchmarks.memory_functions.opt_host)
+
+	add_subdirectory(automemcpy)
+endif()
+
+if(LIBC_TARGET_OS_IS_GPU)
+	add_subdirectory(gpu)
+endif()
diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
new file mode 100644
index 0000000000000..2f258da8a3297
--- /dev/null
+++ b/libc/benchmarks/gpu/CMakeLists.txt
@@ -0,0 +1,203 @@
+add_subdirectory(timing)
+
+add_custom_target(gpu-benchmark)
+
+# Defines a hermetic GPU benchmark executable and a target that runs it:
+#   add_gpu_benchmark(<name> SUITE <suite> SRCS <src>... [HDRS <hdr>...]
+#     [DEPENDS ...] [ARGS ...] [ENV ...] [COMPILE_OPTIONS ...]
+#     [LINK_LIBRARIES ...] [LOADER_ARGS ...])
+# The run target is added to <suite> and to the `gpu-benchmark` umbrella.
+function(add_gpu_benchmark test_name)
+  # Computed first because the diagnostics below print it.
+  get_fq_target_name(${test_name} fq_target_name)
+  if(NOT TARGET libc.startup.${LIBC_TARGET_OS}.crt1)
+    message(VERBOSE "Skipping ${fq_target_name} as it is not available on ${LIBC_TARGET_OS}.")
+    return()
+  endif()
+
+  cmake_parse_arguments(
+    "GPU_BENCHMARK"
+    "" # No optional arguments
+    "SUITE" # Single value arguments
+    "SRCS;HDRS;DEPENDS;ARGS;ENV;COMPILE_OPTIONS;LINK_LIBRARIES;LOADER_ARGS" # Multi-value arguments
+    ${ARGN}
+  )
+
+  if(NOT GPU_BENCHMARK_SUITE)
+    message(FATAL_ERROR "SUITE not specified for ${fq_target_name}")
+  endif()
+  if(NOT GPU_BENCHMARK_SRCS)
+    message(FATAL_ERROR "The SRCS list for add_gpu_benchmark is missing.")
+  endif()
+
+  get_fq_deps_list(fq_deps_list ${GPU_BENCHMARK_DEPENDS})
+  message(STATUS "Depends: ${fq_deps_list}")
+  list(APPEND fq_deps_list
+      # Hermetic tests use the platform's startup object. So, their deps also
+      # have to be collected.
+      libc.startup.${LIBC_TARGET_OS}.crt1
+      # We always add the memory functions objects. This is because the
+      # compiler's codegen can emit calls to the C memory functions.
+      libc.src.string.bcmp
+      libc.src.string.bzero
+      libc.src.string.memcmp
+      libc.src.string.memcpy
+      libc.src.string.memmove
+      libc.src.string.memset
+      libc.src.__support.StringUtil.error_to_string
+  )
+
+  list(REMOVE_DUPLICATES fq_deps_list)
+
+  # TODO: Instead of gathering internal object files from entrypoints,
+  # collect the object files with public names of entrypoints.
+  get_object_files_for_test(
+      link_object_files skipped_entrypoints_list ${fq_deps_list})
+  if(skipped_entrypoints_list)
+    if(LIBC_CMAKE_VERBOSE_LOGGING)
+      set(msg "Skipping hermetic test ${fq_target_name} as it has missing deps: "
+              "${skipped_entrypoints_list}.")
+      message(STATUS ${msg})
+    endif()
+    return()
+  endif()
+  list(REMOVE_DUPLICATES link_object_files)
+  message(STATUS ${link_object_files})
+
+  # Make a library of all deps
+  add_library(
+    ${fq_target_name}.__libc__
+    STATIC
+    EXCLUDE_FROM_ALL
+    ${link_object_files}
+  )
+  set_target_properties(${fq_target_name}.__libc__ PROPERTIES
+      ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+      ARCHIVE_OUTPUT_NAME ${fq_target_name}.libc)
+
+  set(fq_build_target_name ${fq_target_name}.__build__)
+  add_executable(
+    ${fq_build_target_name}
+    EXCLUDE_FROM_ALL
+    $<$<BOOL:${LIBC_TARGET_ARCHITECTURE_IS_NVPTX}>:${link_object_files}>
+    ${GPU_BENCHMARK_SRCS}
+    ${GPU_BENCHMARK_HDRS}
+  )
+  set_target_properties(${fq_build_target_name}
+    PROPERTIES
+      RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+  )
+
+  target_include_directories(${fq_build_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR})
+  target_include_directories(${fq_build_target_name} PRIVATE ${LIBC_SOURCE_DIR})
+  _get_hermetic_test_compile_options(compile_options "${GPU_BENCHMARK_COMPILE_OPTIONS}")
+  message(STATUS "adding compile: ${compile_options}")
+  target_compile_options(${fq_build_target_name} PRIVATE ${compile_options} -save-temps)
+
+  set(link_libraries "")
+  foreach(lib IN LISTS GPU_BENCHMARK_LINK_LIBRARIES)
+    if(TARGET ${lib}.hermetic)
+      list(APPEND link_libraries ${lib}.hermetic)
+    else()
+      list(APPEND link_libraries ${lib})
+    endif()
+  endforeach()
+
+  message(STATUS "IS nvptx: ${LIBC_TARGET_ARCHITECTURE_IS_NVPTX}")
+
+  if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
+    target_link_options(${fq_build_target_name} PRIVATE
+      ${LIBC_COMPILE_OPTIONS_DEFAULT}
+      -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto -Wno-multi-gpu
+      "-Wl,-mllvm,-amdgpu-lower-global-ctor-dtor=0" -nostdlib -static
+      "-Wl,-mllvm,-amdhsa-code-object-version=${LIBC_GPU_CODE_OBJECT_VERSION}")
+  elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
+    # We need to use the internal object versions for NVPTX.
+    set(internal_suffix ".__internal__")
+    target_link_options(${fq_build_target_name} PRIVATE
+      ${LIBC_COMPILE_OPTIONS_DEFAULT} -Wno-multi-gpu
+      "-Wl,--suppress-stack-size-warning"
+      -march=${LIBC_GPU_TARGET_ARCHITECTURE} -nostdlib -static
+      "--cuda-path=${LIBC_CUDA_ROOT}")
+    message(STATUS "ARCH: ${LIBC_GPU_TARGET_ARCHITECTURE}")
+  elseif(LIBC_CC_SUPPORTS_NOSTDLIBPP)
+    target_link_options(${fq_build_target_name} PRIVATE -nolibc -nostartfiles -nostdlib++ -static)
+  else()
+    # Older version of gcc does not support `nostdlib++` flag.  We use
+    # `nostdlib` and link against libgcc_s, which cannot be linked statically.
+    target_link_options(${fq_build_target_name} PRIVATE -nolibc -nostartfiles -nostdlib)
+    list(APPEND link_libraries ${LIBGCC_S_LOCATION})
+  endif()
+
+  # link libraries for the BUILD target (i.e. to compile the test)
+  target_link_libraries(
+    ${fq_build_target_name}
+    PRIVATE
+      libc.startup.${LIBC_TARGET_OS}.crt1${internal_suffix}
+      ${link_libraries}
+      LibcGpuBenchmark.hermetic
+      LibcHermeticTestSupport.hermetic
+      # The NVIDIA 'nvlink' linker does not currently support static libraries.
+      $<$<NOT:$<BOOL:${LIBC_TARGET_ARCHITECTURE_IS_NVPTX}>>:${fq_target_name}.__libc__>)
+
+  add_dependencies(${fq_build_target_name}
+    LibcGpuBenchmark.hermetic ${fq_deps_list})
+
+  # Tests on the GPU require an external loader utility to launch the kernel.
+  if(TARGET libc.utils.gpu.loader)
+    add_dependencies(${fq_build_target_name} libc.utils.gpu.loader)
+    get_target_property(gpu_loader_exe libc.utils.gpu.loader "EXECUTABLE")
+  endif()
+
+  set(test_cmd ${GPU_BENCHMARK_ENV}
+      $<$<BOOL:${LIBC_TARGET_OS_IS_GPU}>:${gpu_loader_exe}> ${CMAKE_CROSSCOMPILING_EMULATOR} ${GPU_BENCHMARK_LOADER_ARGS}
+      $<TARGET_FILE:${fq_build_target_name}> ${GPU_BENCHMARK_ARGS})
+  add_custom_target(
+    ${fq_target_name}
+    COMMAND ${test_cmd}
+    COMMAND_EXPAND_LISTS
+    COMMENT "Running GPU benchmark ${fq_target_name}"
+  )
+
+  # Make this benchmark part of its suite
+  add_dependencies(${GPU_BENCHMARK_SUITE} ${fq_target_name})
+  # Remember to make this benchmark part of the umbrella command
+  add_dependencies(gpu-benchmark ${fq_target_name})
+endfunction()
+
+add_unittest_framework_library(
+  LibcGpuBenchmark
+  SRCS
+    LibcGpuBenchmark.cpp
+    LibcGpuBenchmarkMain.cpp
+    TestLogger.cpp
+  HDRS
+    LibcGpuBenchmark.h
+    TestLogger.h
+  DEPENDS
+    libc.src.__support.big_int
+    libc.src.__support.c_string
+    libc.src.__support.CPP.string
+    libc.src.__support.CPP.string_view
+    libc.src.__support.CPP.type_traits
+    libc.src.__support.fixed_point.fx_rep
+    libc.src.__support.macros.properties.types
+    libc.src.__support.OSUtil.osutil
+    libc.src.__support.uint128
+    libc.benchmarks.gpu.timing.timing
+)
+
+add_custom_target(dummy-suite)
+
+add_gpu_benchmark(
+  dummy
+  SUITE
+    dummy-suite
+  SRCS
+    dummy.cpp
+  DEPENDS
+    libc.src.stdio.fputs
+    libc.src.stdio.stderr
+)
+
+add_subdirectory(src)
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
new file mode 100644
index 0000000000000..46e8f904e0643
--- /dev/null
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -0,0 +1,75 @@
+
+#include "benchmarks/gpu/timing/timing.h"
+
+#include "src/__support/CPP/string.h"
+#include "src/__support/CPP/string_view.h"
+#include "src/__support/OSUtil/io.h"
+
+#include "LibcGpuBenchmark.h"
+
+namespace LIBC_NAMESPACE {
+namespace libc_gpu_benchmarks {
+
+Test *Test::Start = nullptr;
+Test *Test::End = nullptr;
+
+void Test::addTest(Test *T) {
+  if (End == nullptr) {
+    Start = T;
+    End = T;
+    return;
+  }
+
+  End->Next = T;
+  End = T;
+}
+
+int Test::runTests() {
+  for (Test *T = Start; T != nullptr; T = T->Next) {
+    tlog << T->getName() << "\n";
+    T->Run();
+  }
+
+  return 0;
+}
+
+// Estimates the per-call cycle count of WrapperFunc, net of timing overhead.
+// Sampling follows the limits in Options. Returns the final estimate.
+uint64_t benchmark_wrapper(const BenchmarkOptions &Options,
+                           uint64_t (*WrapperFunc)()) {
+  RuntimeEstimationProgression REP;
+  size_t Iterations = Options.InitialIterations;
+  if (Iterations < 1)
+    Iterations = 1;
+  size_t Samples = 0;
+  uint64_t BestGuess = 0;
+  for (;;) {
+    uint64_t SampleCycles = 0;
+    for (size_t I = 0; I < Iterations; I++) {
+      const uint64_t Overhead = LIBC_NAMESPACE::overhead();
+      const uint64_t Cycles = WrapperFunc();
+      // Clamp at zero: unsigned subtraction would wrap if Overhead > Cycles.
+      SampleCycles += Cycles > Overhead ? Cycles - Overhead : 0;
+    }
+
+    Samples++;
+    const double ChangeRatio =
+        REP.ComputeImprovement({Iterations, SampleCycles});
+    BestGuess = REP.CurrentEstimation;
+    if (Samples >= Options.MaxSamples || Iterations >= Options.MaxIterations) {
+      break;
+    } else if (Samples >= Options.MinSamples && ChangeRatio < Options.Epsilon) {
+      tlog << "Samples are stable!\n";
+      break;
+    }
+    // Always advance by at least one; truncating 1 * 1.4 would stall at 1.
+    const size_t Scaled = size_t(Iterations * Options.ScalingFactor);
+    Iterations = Scaled > Iterations ? Scaled : Iterations + 1;
+  }
+  tlog << "Best Guess: " << BestGuess << '\n';
+  tlog << "Samples: " << Samples << '\n';
+  return BestGuess;
+}
+
+} // namespace libc_gpu_benchmarks
+} // namespace LIBC_NAMESPACE
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
new file mode 100644
index 0000000000000..12695c4d18684
--- /dev/null
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -0,0 +1,282 @@
+#ifndef LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H
+#define LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H
+
+#include "benchmarks/gpu/timing/timing.h"
+
+#include "benchmarks/gpu/TestLogger.h"
+#include "src/__support/CPP/string.h"
+#include "src/__support/CPP/string_view.h"
+#include "src/__support/OSUtil/io.h"
+
+#include <stdint.h>
+
+namespace LIBC_NAMESPACE {
+
+namespace libc_gpu_benchmarks {
+
+struct BenchmarkOptions {
+  uint32_t InitialIterations = 1;
+  uint32_t MaxIterations = 10000000;
+  uint32_t MinSamples = 4;
+  uint32_t MaxSamples = 1000;
+  double Epsilon = 0.01;
+  double ScalingFactor = 1.4;
+};
+
+struct Measurement {
+  size_t Iterations = 0;
+  uint64_t ElapsedCycles = 0;
+};
+
+class RefinableRuntimeEstimation {
+  uint64_t TotalCycles = 0;
+  size_t TotalIterations = 0;
+
+public:
+  uint64_t Update(const Measurement &M) {
+    TotalCycles += M.ElapsedCycles;
+    TotalIterations += M.Iterations;
+    return TotalCycles / TotalIterations;
+  }
+};
+
+// Tracks the progression of the runtime estimation
+class RuntimeEstimationProgression {
+  RefinableRuntimeEstimation RRE;
+
+public:
+  uint64_t CurrentEstimation = 0;
+
+  double ComputeImprovement(const Measurement &M) {
+    const uint64_t NewEstimation = RRE.Update(M);
+    double Ratio = ((double)CurrentEstimation / NewEstimation) - 1.0;
+
+    // Get absolute value
+    if (Ratio < 0) {
+      Ratio *= -1;
+    }
+
+    CurrentEstimation = NewEstimation;
+    return Ratio;
+  }
+};
+
+// Estimates the per-call cycle count of f(args...) as reported by latency().
+// Every sample runs `Iterations` timed calls; after each sample the running
+// estimate is refined and the iteration count is scaled up. Sampling ends at
+// MaxSamples or MaxIterations, or once MinSamples have been taken and the
+// estimate changed by less than Epsilon. Returns the final estimate, which
+// is also written to tlog.
+template <typename F, typename... Args>
+uint64_t benchmark(const BenchmarkOptions &Options, F f, Args... args) {
+  RuntimeEstimationProgression REP;
+  size_t Iterations = Options.InitialIterations;
+  if (Iterations < 1)
+    Iterations = 1;
+  size_t Samples = 0;
+  uint64_t BestGuess = 0;
+#if defined(LIBC_TARGET_ARCH_IS_NVPTX)
+  // Nvidia cannot perform LTO, so we need to perform
+  // 1 call to "warm up" the function before microbenchmarking
+  uint64_t result = latency(f, args...);
+  tlog << "Running warm-up iteration: " << result << '\n';
+#endif
+  for (;;) {
+    uint64_t SampleCycles = 0;
+    for (size_t I = 0; I < Iterations; I++) {
+      uint64_t result = latency(f, args...);
+      SampleCycles += result;
+      tlog << result << '\n';
+    }
+
+    Samples++;
+    const double ChangeRatio =
+        REP.ComputeImprovement({Iterations, SampleCycles});
+    BestGuess = REP.CurrentEstimation;
+
+    if (Samples >= Options.MaxSamples || Iterations >= Options.MaxIterations) {
+      break;
+    } else if (Samples >= Options.MinSamples && ChangeRatio < Options.Epsilon) {
+      tlog << "Samples are stable!\n";
+      break;
+    }
+
+    // Grow by at least one iteration: the default 1 * 1.4 truncates back to
+    // 1, so a plain `Iterations *= ScalingFactor` would never scale up.
+    const size_t Scaled = size_t(Iterations * Options.ScalingFactor);
+    Iterations = Scaled > Iterations ? Scaled : Iterations + 1;
+  }
+  tlog << "Best Guess: " << BestGuess << '\n';
+  tlog << "Samples: " << Samples << '\n';
+  return BestGuess;
+}
+
+uint64_t benchmark_wrapper(const BenchmarkOptions &Options,
+                           uint64_t (*WrapperFunc)());
+
+// Variant of benchmark() for a single-argument function: each call is timed
+// with the SINGLE_INPUT_OUTPUT_LATENCY macro instead of latency(). Every
+// sample runs `Iterations` timed calls and refines the running estimate.
+// Sampling ends at MaxSamples or MaxIterations, or once MinSamples have been
+// taken and the estimate changed by less than Epsilon. Returns the final
+// estimate.
+template <typename F, typename Arg>
+uint64_t benchmark_macro(const BenchmarkOptions &Options, F f, Arg arg) {
+  RuntimeEstimationProgression REP;
+  size_t Iterations = Options.InitialIterations;
+  if (Iterations < 1)
+    Iterations = 1;
+  size_t Samples = 0;
+  uint64_t BestGuess = 0;
+  for (;;) {
+    uint64_t SampleCycles = 0;
+    for (size_t I = 0; I < Iterations; I++) {
+      uint64_t result = 0;
+      SINGLE_INPUT_OUTPUT_LATENCY(f, arg, &result);
+      SampleCycles += result;
+      tlog << "Macro: " << result << '\n';
+    }
+
+    Samples++;
+    const double ChangeRatio =
+        REP.ComputeImprovement({Iterations, SampleCycles});
+    BestGuess = REP.CurrentEstimation;
+
+    if (Samples >= Options.MaxSamples || Iterations >= Options.MaxIterations) {
+      break;
+    } else if (Samples >= Options.MinSamples && ChangeRatio < Options.Epsilon) {
+      tlog << "Samples are stable!\n";
+      break;
+    }
+
+    // Grow by at least one iteration: the default 1 * 1.4 truncates back to
+    // 1, so a plain `Iterations *= ScalingFactor` would never scale up.
+    const size_t Scaled = size_t(Iterations * Options.ScalingFactor);
+    Iterations = Scaled > Iterations ? Scaled : Iterations + 1;
+  }
+  tlog << "Macro Best Guess: " << BestGuess << '\n';
+  tlog << "Samples: " << Samples << '\n';
+  return BestGuess;
+}
+
+class Test {
+  Test *Next = nullptr;
+
+public:
+  virtual ~Test() {}
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  static int runTests();
+
+protected:
+  static void addTest(Test *);
+
+private:
+  virtual void Run() = 0;
+  virtual const char *getName() const = 0;
+
+  static Test *Start;
+  static Test *End;
+};
+
+template <typename F> class FunctionBenchmark : public Test {
+  F Func;
+  const char *Name;
+
+public:
+  FunctionBenchmark(F Func, char const *Name) : Func(Func), Name(Name) {
+    addTest(this);
+  }
+
+private:
+  void Run() override {
+    BenchmarkOptions Options;
+    auto latency = benchmark(Options, Func);
+    LIBC_NAMESPACE::libc_gpu_benchmarks::tlog << "FnName: " << Name << '\n';
+    LIBC_NAMESPACE::libc_gpu_benchmarks::tlog << "FnBenchmark: " << latency
+                                              << '\n';
+  }
+  const char *getName() const override { return Name; }
+};
+
+class WrapperBenchmark : public Test {
+  using BenchmarkWrapperFunction = uint64_t (*)();
+  BenchmarkWrapperFunction Func;
+  const char *Name;
+
+public:
+  WrapperBenchmark(BenchmarkWrapperFunction Func, char const *Name)
+      : Func(Func), Name(Name) {
+    addTest(this);
+  }
+
+private:
+  void Run() override {
+    tlog << "Running wrapper: " << Name << '\n';
+    // for (int i = 0; i < 10; i++) {
+    //   auto overhead = LIBC_NAMESPACE::overhead();
+    //   auto result = Func() - overhead;
+    //   tlog << "Result: " << result << '\n';
+    //   tlog << "Overhead: " << overhead << '\n';
+    // }
+    BenchmarkOptions Options;
+    auto latency = benchmark_wrapper(Options, Func);
+    tlog << "FnName: " << Name << '\n';
+    tlog << "FnBenchmark: " << latency << '\n';
+  }
+  const char *getName() const override { return Name; }
+};
+
+} // namespace libc_gpu_benchmarks
+
+} // namespace LIBC_NAMESPACE
+
+// #define BENCHMARK(SuiteName, TestName)                                         \
+//   class SuiteName##_##TestName                                                 \
+//       : public LIBC_NAMESPACE::libc_gpu_benchmarks::Test {                     \
+//   public:                                                                      \
+//     SuiteName##_##TestName() { addTest(this); }                                \
+//     void Run() override;                                                       \
+//     const char *getName() const override { return #SuiteName "." #TestName; }  \
+//   };                                                                           \
+//   SuiteName##_##TestName SuiteName##_##TestName##_Instance;                    \
+//   void SuiteName##_##TestName::Run()
+
+#define BENCHMARK_SINGLE_INPUT_OUTPUT(SuiteName, TestName, Func, Arg)          \
+  class SuiteName##_##TestName                                                 \
+      : public LIBC_NAMESPACE::libc_gpu_benchmarks::Test {                     \
+  public:                                                                      \
+    SuiteName##_##TestName() { addTest(this); }                                \
+    void Run() override;                                                       \
+    const char *getName() const override { return #SuiteName "." #TestName; }  \
+  };                                                                           \
+  SuiteName##_##TestName SuiteName##_##TestName##_Instance;                    \
+  void SuiteName##_##TestName::Run() {                                         \
+    LIBC_NAMESPACE::libc_gpu_benchmarks::BenchmarkOptions Options;             \
+    LIBC_NAMESPACE::libc_gpu_benchmarks::benchmark(Options, &Func, Arg);       \
+  }
+
+#define BENCHMARK_FN(SuiteName, TestName, Func)                                \
+  LIBC_NAMESPACE::libc_gpu_benchmarks::FunctionBenchmark                       \
+      SuiteName##_##TestName##_Instance(Func, #SuiteName "." #TestName);
+
+#define BENCHMARK_WRAPPER(SuiteName, TestName, Func)                           \
+  LIBC_NAMESPACE::libc_gpu_benchmarks::WrapperBenchmark                        \
+      SuiteName##_##TestName##_Instance(Func, #SuiteName "." #TestName);
+
+#define BENCHMARK_S_I_O_V2(SuiteName, TestName, Func, Arg)                     \
+  class SuiteName##_##TestName                                                 \
+      : public LIBC_NAMESPACE::libc_gpu_benchmarks::Test {                     \
+  public:                                                                      \
+    SuiteName##_##TestName() { addTest(this); }                                \
+    void Run() override;                                                       \
+    const char *getName() const override { return #SuiteName "." #TestName; }  \
+  };                                                                           \
+  SuiteName##_##TestName SuiteName##_##TestName##_Instance;                    \
+  void SuiteName##_##TestName::Run() {                                         \
+    LIBC_NAMESPACE::libc_gpu_benchmarks::BenchmarkOptions Options;             \
+    LIBC_NAMESPACE::libc_gpu_benchmarks::benchmark_macro(Options, &Func, Arg); \
+  }
+
+#endif
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp b/libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp
new file mode 100644
index 0000000000000..e7e4f08f5af68
--- /dev/null
+++ b/libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp
@@ -0,0 +1,6 @@
+#include "LibcGpuBenchmark.h"
+
+extern "C" int main(int argc, char **argv, char **envp) {
+  LIBC_NAMESPACE::libc_gpu_benchmarks::Test::runTests();
+  return 0;
+}
diff --git a/libc/benchmarks/gpu/TestLogger.cpp b/libc/benchmarks/gpu/TestLogger.cpp
new file mode 100644
index 0000000000000..b3a8399a91adb
--- /dev/null
+++ b/libc/benchmarks/gpu/TestLogger.cpp
@@ -0,0 +1,89 @@
+#include "benchmarks/gpu/TestLogger.h"
+#include "src/__support/CPP/string.h"
+#include "src/__support/CPP/string_view.h"
+#include "src/__support/OSUtil/io.h"               // write_to_stderr
+#include "src/__support/big_int.h"                 // is_big_int
+#include "src/__support/macros/properties/types.h" // LIBC_TYPES_HAS_INT128
+#include "src/__support/uint128.h"
+
+#include <stdint.h>
+
+namespace LIBC_NAMESPACE {
+namespace libc_gpu_benchmarks {
+
+// cpp::string_view specialization
+template <>
+TestLogger &TestLogger::operator<< <cpp::string_view>(cpp::string_view str) {
+  LIBC_NAMESPACE::write_to_stderr(str);
+  return *this;
+}
+
+// cpp::string specialization
+template <> TestLogger &TestLogger::operator<< <cpp::string>(cpp::string str) {
+  return *this << static_cast<cpp::string_view>(str);
+}
+
+// const char* specialization
+template <> TestLogger &TestLogger::operator<< <const char *>(const char *str) {
+  return *this << cpp::string_view(str);
+}
+
+// char* specialization
+template <> TestLogger &TestLogger::operator<< <char *>(char *str) {
+  return *this << cpp::string_view(str);
+}
+
+// char specialization
+template <> TestLogger &TestLogger::operator<<(char ch) {
+  return *this << cpp::string_view(&ch, 1);
+}
+
+// bool specialization
+template <> TestLogger &TestLogger::operator<<(bool cond) {
+  return *this << (cond ? "true" : "false");
+}
+
+// void * specialization
+template <> TestLogger &TestLogger::operator<<(void *addr) {
+  return *this << "0x" << cpp::to_string(reinterpret_cast<uintptr_t>(addr));
+}
+
+template <typename T> TestLogger &TestLogger::operator<<(T t) {
+  if constexpr (is_big_int_v<T> ||
+                (cpp::is_integral_v<T> && cpp::is_unsigned_v<T> &&
+                 (sizeof(T) > sizeof(uint64_t)))) {
+    static_assert(sizeof(T) % 8 == 0, "Unsupported size of UInt");
+    const IntegerToString<T, radix::Hex::WithPrefix> buffer(t);
+    return *this << buffer.view();
+  } else {
+    return *this << cpp::to_string(t);
+  }
+}
+
+// is_integral specializations
+// char is already specialized to handle character
+template TestLogger &TestLogger::operator<< <short>(short);
+template TestLogger &TestLogger::operator<< <int>(int);
+template TestLogger &TestLogger::operator<< <long>(long);
+template TestLogger &TestLogger::operator<< <long long>(long long);
+template TestLogger &TestLogger::operator<< <unsigned char>(unsigned char);
+template TestLogger &TestLogger::operator<< <unsigned short>(unsigned short);
+template TestLogger &TestLogger::operator<< <unsigned int>(unsigned int);
+template TestLogger &TestLogger::operator<< <unsigned long>(unsigned long);
+template TestLogger &
+    TestLogger::operator<< <unsigned long long>(unsigned long long);
+
+#ifdef LIBC_TYPES_HAS_INT128
+template TestLogger &TestLogger::operator<< <__uint128_t>(__uint128_t);
+#endif // LIBC_TYPES_HAS_INT128
+template TestLogger &TestLogger::operator<< <UInt<128>>(UInt<128>);
+template TestLogger &TestLogger::operator<< <UInt<192>>(UInt<192>);
+template TestLogger &TestLogger::operator<< <UInt<256>>(UInt<256>);
+template TestLogger &TestLogger::operator<< <UInt<320>>(UInt<320>);
+
+// TODO: Add floating point formatting once it's supported by StringStream.
+
+TestLogger tlog;
+
+} // namespace libc_gpu_benchmarks
+} // namespace LIBC_NAMESPACE
diff --git a/libc/benchmarks/gpu/TestLogger.h b/libc/benchmarks/gpu/TestLogger.h
new file mode 100644
index 0000000000000..68690ce0c3940
--- /dev/null
+++ b/libc/benchmarks/gpu/TestLogger.h
@@ -0,0 +1,27 @@
+//===-- Utilities to log to standard output during tests --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_TEST_UNITTEST_TESTLOGGER_H
+#define LLVM_LIBC_TEST_UNITTEST_TESTLOGGER_H
+
+namespace LIBC_NAMESPACE {
+namespace libc_gpu_benchmarks {
+
+// A class to log to standard output in the context of hermetic tests.
+struct TestLogger {
+  constexpr TestLogger() = default;
+  template <typename T> TestLogger &operator<<(T);
+};
+
+// A global TestLogger instance to be used in tests.
+extern TestLogger tlog;
+
+} // namespace libc_gpu_benchmarks
+} // namespace LIBC_NAMESPACE
+
+#endif /* LLVM_LIBC_TEST_UNITTEST_TESTLOGGER_H */
diff --git a/libc/benchmarks/gpu/dummy.cpp b/libc/benchmarks/gpu/dummy.cpp
new file mode 100644
index 0000000000000..6d15f98fca220
--- /dev/null
+++ b/libc/benchmarks/gpu/dummy.cpp
@@ -0,0 +1,42 @@
+#include "LibcGpuBenchmark.h"
+#include "timing/timing.h"
+
+#include "src/stdio/fputs.h"
+
+int add_test(int x) {
+    return x + 1;
+}
+
+__attribute__((noinline)) [[gnu::noinline]] int function_call_overhead(int x) {
+    asm volatile ("");
+    return x;
+}
+
+// void DummyHiBenchmark() {
+//     LIBC_NAMESPACE::fputs("Hi\n", stderr);
+// }
+// BENCHMARK_FN(Dummy, DummyHiBenchmark, DummyHiBenchmark);
+
+// void DummyV2Benchmark() {
+//     int result = dummy_hi(10);
+//     asm volatile("" :: "r"(result));
+//     auto test_cycles = LIBC_NAMESPACE::latency(dummy_hi, 10);
+//     LIBC_NAMESPACE::libc_gpu_benchmarks::tlog << "In func: " << test_cycles << '\n';
+// }
+// BENCHMARK_FN(Dummy, DummyV2Benchmark, DummyV2Benchmark);
+
+// BENCHMARK_SINGLE_INPUT_OUTPUT(Dummy, DummySingleInputOutput, dummy_hi, 10);
+
+// BENCHMARK_S_I_O_V2(Dummy, DummySIOMacro, dummy_hi, 10);
+
+// uint64_t DummyWrapperBenchmark() {
+//     int x = 10;
+//     return LIBC_NAMESPACE::latency(add_test, x);
+// }
+// BENCHMARK_WRAPPER(Dummy, DummyWrapperBenchmark, DummyWrapperBenchmark);
+
+uint64_t DummyFunctionCallOverhead() {
+    int x = 10;
+    return LIBC_NAMESPACE::latency(function_call_overhead, x);
+}
+BENCHMARK_WRAPPER(Dummy, DummyFunctionCallOverhead, DummyFunctionCallOverhead);
diff --git a/libc/benchmarks/gpu/src/CMakeLists.txt b/libc/benchmarks/gpu/src/CMakeLists.txt
new file mode 100644
index 0000000000000..f15d082e4dd2b
--- /dev/null
+++ b/libc/benchmarks/gpu/src/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_subdirectory(ctype)
+add_subdirectory(math)
diff --git a/libc/benchmarks/gpu/src/ctype/CMakeLists.txt b/libc/benchmarks/gpu/src/ctype/CMakeLists.txt
new file mode 100644
index 0000000000000..ab2f6cdf0c7fd
--- /dev/null
+++ b/libc/benchmarks/gpu/src/ctype/CMakeLists.txt
@@ -0,0 +1,21 @@
+add_custom_target(libc-gpu-ctype-benchmarks)
+
+add_gpu_benchmark(
+    isalnum_benchmark
+    SUITE
+        libc-gpu-ctype-benchmarks
+    SRCS
+        isalnum_benchmark.cpp
+    DEPENDS
+        libc.src.ctype.isalnum
+)
+
+add_gpu_benchmark(
+    isalpha_benchmark
+    SUITE
+        libc-gpu-ctype-benchmarks
+    SRCS
+        isalpha_benchmark.cpp
+    DEPENDS
+        libc.src.ctype.isalpha
+)
diff --git a/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp b/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp
new file mode 100644
index 0000000000000..071675bb887b6
--- /dev/null
+++ b/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp
@@ -0,0 +1,24 @@
+#include "benchmarks/gpu/LibcGpuBenchmark.h"
+
+#include "src/ctype/isalnum.h"
+
+// BENCHMARK_SINGLE_INPUT_OUTPUT(LlvmLibcIsAlNumGpuBenchmark,
+//                               IsAlnumSingleInputOutput,
+//                               LIBC_NAMESPACE::isalnum, 'c');
+
+// void BM_IsAlnumBasic() { bool isAlpha = LIBC_NAMESPACE::isalnum('c'); }
+// BENCHMARK_FN(LlvmLibcIsAlNumGpuBenchmark, IsAlnumC, BM_IsAlnumBasic);
+
+uint64_t BM_IsAlnumWrapper() {
+  char x = 'c';
+  return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalnum, x);
+}
+BENCHMARK_WRAPPER(LlvmLibcIsAlNumGpuBenchmark, IsAlnumWrapper,
+                  BM_IsAlnumWrapper);
+
+uint64_t BM_IsAlnumWithOverhead() {
+  char x = 'c';
+  return LIBC_NAMESPACE::function_call_latency(LIBC_NAMESPACE::isalnum, x);
+}
+BENCHMARK_WRAPPER(LlvmLibcIsAlNumGpuBenchmark, IsAlnumWithOverhead,
+                  BM_IsAlnumWithOverhead);
diff --git a/libc/benchmarks/gpu/src/ctype/isalpha_benchmark.cpp b/libc/benchmarks/gpu/src/ctype/isalpha_benchmark.cpp
new file mode 100644
index 0000000000000..e432b1cf72c5f
--- /dev/null
+++ b/libc/benchmarks/gpu/src/ctype/isalpha_benchmark.cpp
@@ -0,0 +1,9 @@
+#include "benchmarks/gpu/LibcGpuBenchmark.h"
+
+#include "src/ctype/isalpha.h"
+
+uint64_t BM_IsAlpha() {
+  char x = 'c';
+  return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalpha, x);
+}
+BENCHMARK_WRAPPER(LlvmLibcIsAlphaGpuBenchmark, IsAlpha, BM_IsAlpha);
diff --git a/libc/benchmarks/gpu/src/math/CMakeLists.txt b/libc/benchmarks/gpu/src/math/CMakeLists.txt
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/libc/benchmarks/gpu/timing/CMakeLists.txt b/libc/benchmarks/gpu/timing/CMakeLists.txt
new file mode 100644
index 0000000000000..4a65b25005bab
--- /dev/null
+++ b/libc/benchmarks/gpu/timing/CMakeLists.txt
@@ -0,0 +1,20 @@
+message(STATUS "In GPU subdir")
+# if(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU)
+#     message(STATUS "Target arch is NOT gpu")
+#     return()
+# endif()
+
+foreach(target nvptx)
+    add_subdirectory(${target})
+    list(APPEND target_gpu_timing libc.benchmarks.gpu.timing.${target}.${target}_timing)
+endforeach()
+
+message(STATUS "GPU TIMING: ${target_gpu_timing}")
+
+add_header_library(
+    timing
+    HDRS
+        timing.h
+    DEPENDS
+        ${target_gpu_timing}
+)
diff --git a/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt b/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt
new file mode 100644
index 0000000000000..7343ab7791197
--- /dev/null
+++ b/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt
@@ -0,0 +1,8 @@
+message(STATUS "IN nvptx dir")
+add_header_library(
+  nvptx_timing
+  HDRS
+    timing.h
+  DEPENDS
+    libc.src.__support.common
+)
diff --git a/libc/benchmarks/gpu/timing/nvptx/timing.h b/libc/benchmarks/gpu/timing/nvptx/timing.h
new file mode 100644
index 0000000000000..39c23d596b7f3
--- /dev/null
+++ b/libc/benchmarks/gpu/timing/nvptx/timing.h
@@ -0,0 +1,175 @@
+//===------------- NVPTX implementation of timing utils ---------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
+#define LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
+
+#include "src/__support/GPU/utils.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/attributes.h"
+#include "src/__support/macros/config.h"
+
+#include <stdint.h>
+
+namespace LIBC_NAMESPACE {
+
+// Returns the overhead associated with calling the profiling region. This
+// allows us to subtract the constant-time overhead from the latency to
+// obtain a true result. This can vary with system load.
+[[gnu::noinline]] static uint64_t overhead() {
+  volatile uint32_t x = 1;
+  uint32_t y = x; // read through the volatile so y is not a known constant
+  gpu::sync_threads();
+  uint64_t start = gpu::processor_clock();
+  asm volatile("" ::"r"(y), "llr"(start));
+  uint32_t result = y; // stands in for the value a timed call would return
+  asm volatile("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :);
+  uint64_t stop = gpu::processor_clock();
+  gpu::sync_threads();
+  volatile auto storage = result; // volatile store of the result
+  return stop - start;
+}
+
+// Calls f once on a copy of t and returns the elapsed clock cycles measured
+// around the call. This function cannot be inlined or else it will disturb the
+// very delicate balance of hard-coded dependencies.
+//
+// FIXME: This does not work in general on NVPTX because of further
+// optimizations ptxas performs. The only way to get consistent results is to
+// pass an extra "SHELL:-Xcuda-ptxas -O0" to CMake's compiler flags. This
+// negatively impacts performance but it is at least stable.
+template <typename F, typename T>
+[[gnu::noinline]] static LIBC_INLINE uint64_t latency(F f, T t) {
+  // We need to store the input somewhere to guarantee that the compiler will
+  // not constant propagate it and remove the profiling region.
+  volatile T storage = t;
+  T arg = storage;
+  asm volatile("" ::"r"(arg));
+
+  // Get the current timestamp from the clock.
+  gpu::sync_threads();
+  __nvvm_membar_sys();
+  uint64_t start = gpu::processor_clock();
+
+  // This forces the compiler to load the input argument and run the clock cycle
+  // counter before the profiling region.
+  asm volatile("" ::"r"(arg), "llr"(start));
+
+  // Run the function under test and capture its return value.
+  auto result = f(arg);
+
+  // This inline assembly performs a no-op which forces the result to both be
+  // used and prevents us from exiting this region before it's complete.
+  asm volatile("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :);
+
+  // Obtain the current timestamp after running the calculation and force
+  // ordering.
+  uint64_t stop = gpu::processor_clock();
+  __nvvm_membar_sys();
+  gpu::sync_threads();
+  asm volatile("" ::"r"(stop));
+  volatile auto output = result; // typed after f's return, which may not be T
+
+  // Return the time elapsed.
+  return stop - start;
+}
+
+[[gnu::noinline]] static uint64_t single_input_function(int x) {
+  asm volatile("" :: "r"(x)); // prevent the compiler from optimizing out x
+  return x;
+}
+
+template <typename F, typename T>
+static LIBC_INLINE uint64_t function_call_latency(F f, T t) {
+  auto function_call_overhead = latency(single_input_function, 0);
+  return latency(f, t) - function_call_overhead;
+}
+
+static LIBC_INLINE uint64_t latency(void (*f)()) {
+  // Times a single call of f. Get the current timestamp from the clock first.
+  gpu::sync_threads();
+  uint64_t start = gpu::processor_clock();
+
+  // Feed the start timestamp to an empty volatile asm statement; this is meant
+  // to keep the clock read ahead of the profiling region.
+  asm volatile("" ::"llr"(start));
+
+  // Run the function under test; it takes no arguments and returns nothing.
+  f();
+
+  // Obtain the current timestamp after running the function, then call
+  // gpu::sync_threads() again.
+  uint64_t stop = gpu::processor_clock();
+  gpu::sync_threads();
+
+  // Return the time elapsed.
+  return stop - start;
+}
+
+template <typename F, typename T1, typename T2>
+static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {
+  volatile T1 storage = t1;
+  volatile T2 storage2 = t2;
+  T1 arg = storage;
+  T2 arg2 = storage2;
+  asm volatile("" ::"r"(arg), "r"(arg2));
+
+  gpu::sync_threads();
+  uint64_t start = gpu::processor_clock();
+
+  asm volatile("" ::"r"(arg), "r"(arg2), "llr"(start));
+
+  auto result = f(arg, arg2);
+
+  asm volatile("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :);
+
+  uint64_t stop = gpu::processor_clock();
+  gpu::sync_threads();
+  asm volatile("" ::"r"(stop));
+  volatile auto output = result;
+
+  return stop - start;
+}
+
+} // namespace LIBC_NAMESPACE
+
+// Times one call of Func(t) with the processor clock and logs the readings.
+// LatencyP must be a uint64_t * that receives stop - start; a logger named
+// `tlog` must be visible at the expansion site.
+#define SINGLE_INPUT_OUTPUT_LATENCY(Func, t, LatencyP)                         \
+  do {                                                                         \
+    tlog << "Latency: " << ((long)(*LatencyP)) << '\n';                        \
+    *LatencyP = 200;                                                           \
+    volatile auto storage = t;                                                 \
+    auto arg = storage;                                                        \
+    asm volatile("" ::"r"(arg), "r"(LatencyP));                                \
+                                                                               \
+    LIBC_NAMESPACE::gpu::sync_threads();                                       \
+    uint64_t start = LIBC_NAMESPACE::gpu::processor_clock();                   \
+                                                                               \
+    asm volatile("" ::"r"(arg), "llr"(start));                                 \
+    auto result = Func(arg);                                                   \
+    asm volatile("" ::"r"(LatencyP));                                          \
+    *LatencyP = 312;                                                           \
+    asm volatile("" ::"r"(LatencyP));                                          \
+    asm volatile("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :);     \
+    asm volatile("" ::"r"(LatencyP));                                          \
+    *LatencyP = 499;                                                           \
+                                                                               \
+    uint64_t stop = LIBC_NAMESPACE::gpu::processor_clock();                    \
+    LIBC_NAMESPACE::gpu::sync_threads();                                       \
+    volatile auto output = result;                                             \
+                                                                               \
+    tlog << "Start: " << start << '\n';                                        \
+    tlog << "Stop: " << stop << '\n';                                          \
+    tlog << "Diff: " << (stop - start) << '\n';                                \
+    asm volatile("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(LatencyP) :);   \
+    *LatencyP = stop - start;                                                  \
+  } while (0)
+
+#endif // LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
diff --git a/libc/benchmarks/gpu/timing/timing.h b/libc/benchmarks/gpu/timing/timing.h
new file mode 100644
index 0000000000000..45201e56964e6
--- /dev/null
+++ b/libc/benchmarks/gpu/timing/timing.h
@@ -0,0 +1,22 @@
+//===------------- Implementation of GPU timing utils -----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_UTILS_GPU_TIMING_H
+#define LLVM_LIBC_UTILS_GPU_TIMING_H
+
+#include "src/__support/macros/properties/architectures.h"
+
+#if defined(LIBC_TARGET_ARCH_IS_AMDGPU)
+#error "amdgpu not yet supported
+#elif defined(LIBC_TARGET_ARCH_IS_NVPTX)
+#include "nvptx/timing.h"
+#else
+#error "unsupported platform"
+#endif
+
+#endif // LLVM_LIBC_UTILS_GPU_TIMING_H
\ No newline at end of file

>From 89c13395da99a54d00ecc276c329e1e15a20406d Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Sun, 12 May 2024 13:45:10 -0400
Subject: [PATCH 2/3] clean up

---
 libc/CMakeLists.txt                           |   2 -
 libc/benchmarks/gpu/BenchmarkLogger.cpp       |  89 ++++++++++++
 .../gpu/{TestLogger.h => BenchmarkLogger.h}   |  14 +-
 libc/benchmarks/gpu/CMakeLists.txt            |  11 +-
 libc/benchmarks/gpu/LibcGpuBenchmark.cpp      |  22 +--
 libc/benchmarks/gpu/LibcGpuBenchmark.h        | 128 +++---------------
 libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp  |   2 +-
 libc/benchmarks/gpu/TestLogger.cpp            |  89 ------------
 libc/benchmarks/gpu/dummy.cpp                 |  41 ++----
 libc/benchmarks/gpu/timing/CMakeLists.txt     |   8 --
 .../gpu/timing/nvptx/CMakeLists.txt           |   1 -
 11 files changed, 137 insertions(+), 270 deletions(-)
 create mode 100644 libc/benchmarks/gpu/BenchmarkLogger.cpp
 rename libc/benchmarks/gpu/{TestLogger.h => BenchmarkLogger.h} (66%)
 delete mode 100644 libc/benchmarks/gpu/TestLogger.cpp

diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt
index ba6eadd207a4f..175efd89d67e6 100644
--- a/libc/CMakeLists.txt
+++ b/libc/CMakeLists.txt
@@ -400,9 +400,7 @@ if(LLVM_INCLUDE_TESTS)
   add_subdirectory(fuzzing)
 endif()
 
-message(STATUS "Checking on variable: ${LIBC_INCLUDE_BENCHMARKS}")
 if(LIBC_INCLUDE_BENCHMARKS)
-	message(STATUS "including libc benchmarks")
   add_subdirectory(benchmarks)
 endif()
 
diff --git a/libc/benchmarks/gpu/BenchmarkLogger.cpp b/libc/benchmarks/gpu/BenchmarkLogger.cpp
new file mode 100644
index 0000000000000..94a0d897c9585
--- /dev/null
+++ b/libc/benchmarks/gpu/BenchmarkLogger.cpp
@@ -0,0 +1,89 @@
+#include "benchmarks/gpu/BenchmarkLogger.h"
+#include "src/__support/CPP/string.h"
+#include "src/__support/CPP/string_view.h"
+#include "src/__support/OSUtil/io.h"               // write_to_stderr
+#include "src/__support/big_int.h"                 // is_big_int
+#include "src/__support/macros/properties/types.h" // LIBC_TYPES_HAS_INT128
+#include "src/__support/uint128.h"
+
+#include <stdint.h>
+
+namespace LIBC_NAMESPACE {
+namespace libc_gpu_benchmarks {
+
+// cpp::string_view specialization
+template <>
+BenchmarkLogger &BenchmarkLogger::operator<< <cpp::string_view>(cpp::string_view str) {
+  LIBC_NAMESPACE::write_to_stderr(str);
+  return *this;
+}
+
+// cpp::string specialization
+template <> BenchmarkLogger &BenchmarkLogger::operator<< <cpp::string>(cpp::string str) {
+  return *this << static_cast<cpp::string_view>(str);
+}
+
+// const char* specialization
+template <> BenchmarkLogger &BenchmarkLogger::operator<< <const char *>(const char *str) {
+  return *this << cpp::string_view(str);
+}
+
+// char* specialization
+template <> BenchmarkLogger &BenchmarkLogger::operator<< <char *>(char *str) {
+  return *this << cpp::string_view(str);
+}
+
+// char specialization
+template <> BenchmarkLogger &BenchmarkLogger::operator<<(char ch) {
+  return *this << cpp::string_view(&ch, 1);
+}
+
+// bool specialization
+template <> BenchmarkLogger &BenchmarkLogger::operator<<(bool cond) {
+  return *this << (cond ? "true" : "false");
+}
+
+// void * specialization: prints the address in prefixed hexadecimal
+template <> BenchmarkLogger &BenchmarkLogger::operator<<(void *addr) {
+  return *this << IntegerToString<uintptr_t, radix::Hex::WithPrefix>(reinterpret_cast<uintptr_t>(addr)).view();
+}
+
+template <typename T> BenchmarkLogger &BenchmarkLogger::operator<<(T t) {
+  if constexpr (is_big_int_v<T> ||
+                (cpp::is_integral_v<T> && cpp::is_unsigned_v<T> &&
+                 (sizeof(T) > sizeof(uint64_t)))) {
+    static_assert(sizeof(T) % 8 == 0, "Unsupported size of UInt");
+    const IntegerToString<T, radix::Hex::WithPrefix> buffer(t);
+    return *this << buffer.view();
+  } else {
+    return *this << cpp::to_string(t);
+  }
+}
+
+// is_integral specializations
+// char is already specialized to handle character
+template BenchmarkLogger &BenchmarkLogger::operator<< <short>(short);
+template BenchmarkLogger &BenchmarkLogger::operator<< <int>(int);
+template BenchmarkLogger &BenchmarkLogger::operator<< <long>(long);
+template BenchmarkLogger &BenchmarkLogger::operator<< <long long>(long long);
+template BenchmarkLogger &BenchmarkLogger::operator<< <unsigned char>(unsigned char);
+template BenchmarkLogger &BenchmarkLogger::operator<< <unsigned short>(unsigned short);
+template BenchmarkLogger &BenchmarkLogger::operator<< <unsigned int>(unsigned int);
+template BenchmarkLogger &BenchmarkLogger::operator<< <unsigned long>(unsigned long);
+template BenchmarkLogger &
+    BenchmarkLogger::operator<< <unsigned long long>(unsigned long long);
+
+#ifdef LIBC_TYPES_HAS_INT128
+template BenchmarkLogger &BenchmarkLogger::operator<< <__uint128_t>(__uint128_t);
+#endif // LIBC_TYPES_HAS_INT128
+template BenchmarkLogger &BenchmarkLogger::operator<< <UInt<128>>(UInt<128>);
+template BenchmarkLogger &BenchmarkLogger::operator<< <UInt<192>>(UInt<192>);
+template BenchmarkLogger &BenchmarkLogger::operator<< <UInt<256>>(UInt<256>);
+template BenchmarkLogger &BenchmarkLogger::operator<< <UInt<320>>(UInt<320>);
+
+// TODO: Add floating point formatting once it's supported by StringStream.
+
+BenchmarkLogger blog;
+
+} // namespace libc_gpu_benchmarks
+} // namespace LIBC_NAMESPACE
diff --git a/libc/benchmarks/gpu/TestLogger.h b/libc/benchmarks/gpu/BenchmarkLogger.h
similarity index 66%
rename from libc/benchmarks/gpu/TestLogger.h
rename to libc/benchmarks/gpu/BenchmarkLogger.h
index 68690ce0c3940..ed3cc97e59c6d 100644
--- a/libc/benchmarks/gpu/TestLogger.h
+++ b/libc/benchmarks/gpu/BenchmarkLogger.h
@@ -6,22 +6,22 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_LIBC_TEST_UNITTEST_TESTLOGGER_H
-#define LLVM_LIBC_TEST_UNITTEST_TESTLOGGER_H
+#ifndef LLVM_LIBC_BENCHMARKS_GPU_BENCHMARKLOGGER_H
+#define LLVM_LIBC_BENCHMARKS_GPU_BENCHMARKLOGGER_H
 
 namespace LIBC_NAMESPACE {
 namespace libc_gpu_benchmarks {
 
 // A class to log to standard output in the context of hermetic tests.
-struct TestLogger {
-  constexpr TestLogger() = default;
-  template <typename T> TestLogger &operator<<(T);
+struct BenchmarkLogger {
+  constexpr BenchmarkLogger() = default;
+  template <typename T> BenchmarkLogger &operator<<(T);
 };
 
 // A global TestLogger instance to be used in tests.
-extern TestLogger tlog;
+extern BenchmarkLogger blog;
 
 } // namespace libc_gpu_benchmarks
 } // namespace LIBC_NAMESPACE
 
-#endif /* LLVM_LIBC_TEST_UNITTEST_TESTLOGGER_H */
+#endif /* LLVM_LIBC_BENCHMARKS_GPU_BENCHMARKLOGGER_H */
diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
index 2f258da8a3297..5920c5b5e5dfc 100644
--- a/libc/benchmarks/gpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/CMakeLists.txt
@@ -26,7 +26,6 @@ function (add_gpu_benchmark test_name)
   get_fq_target_name(${test_name} fq_target_name)
   get_fq_target_name(${test_name}.libc fq_libc_target_name) # Stores the compiled libc + infrastructure archive to link in
   get_fq_deps_list(fq_deps_list ${GPU_BENCHMARK_DEPENDS})
-  message(STATUS "Depends: ${fq_deps_list}")
   list(APPEND fq_deps_list
       # Hermetic tests use the platform's startup object. So, their deps also
       # have to be collected.
@@ -52,12 +51,10 @@ function (add_gpu_benchmark test_name)
     if(LIBC_CMAKE_VERBOSE_LOGGING)
       set(msg "Skipping hermetic test ${fq_target_name} as it has missing deps: "
               "${skipped_entrypoints_list}.")
-      message(STATUS ${msg})
     endif()
     return()
   endif()
   list(REMOVE_DUPLICATES link_object_files)
-  message(STATUS ${link_object_files})
 
   # Make a library of all deps
   add_library(
@@ -88,7 +85,6 @@ function (add_gpu_benchmark test_name)
   target_include_directories(${fq_build_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR})
   target_include_directories(${fq_build_target_name} PRIVATE ${LIBC_SOURCE_DIR})
   _get_hermetic_test_compile_options(compile_options "${GPU_BENCHMARK_COMPILE_OPTIONS}")
-  message(STATUS "adding compile: ${compile_options}")
   target_compile_options(${fq_build_target_name} PRIVATE ${compile_options} -save-temps)
 
   set(link_libraries "")
@@ -100,8 +96,6 @@ function (add_gpu_benchmark test_name)
     endif()
   endforeach()
 
-  message(STATUS "IS nvptx: ${LIBC_TARGET_ARCHITECTURE_IS_NVPTX}")
-
   if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
     target_link_options(${fq_build_target_name} PRIVATE
       ${LIBC_COMPILE_OPTIONS_DEFAULT}
@@ -116,7 +110,6 @@ function (add_gpu_benchmark test_name)
       "-Wl,--suppress-stack-size-warning"
       -march=${LIBC_GPU_TARGET_ARCHITECTURE} -nostdlib -static
       "--cuda-path=${LIBC_CUDA_ROOT}")
-    message(STATUS "ARCH: ${LIBC_GPU_TARGET_ARCHITECTURE}")
   elseif(LIBC_CC_SUPPORTS_NOSTDLIBPP)
     target_link_options(${fq_build_target_name} PRIVATE -nolibc -nostartfiles -nostdlib++ -static)
   else()
@@ -170,10 +163,10 @@ add_unittest_framework_library(
   SRCS
     LibcGpuBenchmark.cpp
     LibcGpuBenchmarkMain.cpp
-    TestLogger.cpp
+    BenchmarkLogger.cpp
   HDRS
     LibcGpuBenchmark.h
-    TestLogger.h
+    BenchmarkLogger.h
   DEPENDS
     libc.src.__support.big_int
     libc.src.__support.c_string
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index 46e8f904e0643..f46f4a08362d8 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -10,24 +10,24 @@
 namespace LIBC_NAMESPACE {
 namespace libc_gpu_benchmarks {
 
-Test *Test::Start = nullptr;
-Test *Test::End = nullptr;
+Benchmark *Benchmark::Start = nullptr;
+Benchmark *Benchmark::End = nullptr;
 
-void Test::addTest(Test *T) {
+void Benchmark::addBenchmark(Benchmark *B) {
   if (End == nullptr) {
-    Start = T;
-    End = T;
+    Start = B;
+    End = B;
     return;
   }
 
-  End->Next = T;
-  End = T;
+  End->Next = B;
+  End = B;
 }
 
-int Test::runTests() {
-  for (Test *T = Start; T != nullptr; T = T->Next) {
-    tlog << T->getName() << "\n";
-    T->Run();
+int Benchmark::runBenchmarks() {
+  for (Benchmark *B = Start; B != nullptr; B = B->Next) {
+    tlog << B->getName() << "\n";
+    B->Run();
   }
 
   return 0;
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index 12695c4d18684..c9e4014808dc0 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -3,7 +3,7 @@
 
 #include "benchmarks/gpu/timing/timing.h"
 
-#include "benchmarks/gpu/TestLogger.h"
+#include "benchmarks/gpu/BenchmarkLogger.h"
 #include "src/__support/CPP/string.h"
 #include "src/__support/CPP/string_view.h"
 #include "src/__support/OSUtil/io.h"
@@ -100,107 +100,57 @@ uint64_t benchmark(const BenchmarkOptions &Options, F f, Args... args) {
 
     Iterations *= Options.ScalingFactor;
   }
-  // for (int i = 0; i < 3; i++) {
-  //   uint64_t result = latency(f, args...);
-  //   BestGuess = result;
-  //   write_to_stderr(cpp::to_string(result));
-  //   write_to_stderr(cpp::string_view("\n"));
-  // }
   tlog << "Best Guess: " << BestGuess << '\n';
   tlog << "Samples: " << Samples << '\n';
+  tlog << "\n";
   return BestGuess;
 };
 
 uint64_t benchmark_wrapper(const BenchmarkOptions &Options,
                            uint64_t (*WrapperFunc)());
 
-template <typename F, typename Arg>
-uint64_t benchmark_macro(const BenchmarkOptions &Options, F f, Arg arg) {
-  RuntimeEstimationProgression REP;
-  size_t Iterations = Options.InitialIterations;
-  if (Iterations < (uint32_t)1) {
-    Iterations = 1;
-  }
-  size_t Samples = 0;
-  uint64_t BestGuess = 0;
-  uint64_t TotalCycles = 0;
-  for (;;) {
-    uint64_t SampleCycles = 0;
-    for (uint32_t i = 0; i < Iterations; i++) {
-      uint64_t result = 0;
-      SINGLE_INPUT_OUTPUT_LATENCY(f, arg, &result);
-      SampleCycles += result;
-      tlog << "Macro: " << result << '\n';
-    }
-
-    Samples++;
-    TotalCycles += SampleCycles;
-    const double ChangeRatio =
-        REP.ComputeImprovement({Iterations, SampleCycles});
-    BestGuess = REP.CurrentEstimation;
-
-    if (Samples >= Options.MaxSamples || Iterations >= Options.MaxIterations) {
-      break;
-    } else if (Samples >= Options.MinSamples && ChangeRatio < Options.Epsilon) {
-      tlog << "Samples are stable!\n";
-      break;
-    }
-
-    Iterations *= Options.ScalingFactor;
-  }
-  // for (int i = 0; i < 3; i++) {
-  //   uint64_t result = latency(f, args...);
-  //   BestGuess = result;
-  //   write_to_stderr(cpp::to_string(result));
-  //   write_to_stderr(cpp::string_view("\n"));
-  // }
-  tlog << "Macro Best Guess: " << BestGuess << '\n';
-  tlog << "Samples: " << Samples << '\n';
-  return BestGuess;
-};
-
-class Test {
-  Test *Next = nullptr;
+class Benchmark {
+  Benchmark *Next = nullptr;
 
 public:
-  virtual ~Test() {}
+  virtual ~Benchmark() {}
   virtual void SetUp() {}
   virtual void TearDown() {}
 
-  static int runTests();
+  static int runBenchmarks();
 
 protected:
-  static void addTest(Test *);
+  static void addBenchmark(Benchmark *);
 
 private:
   virtual void Run() = 0;
   virtual const char *getName() const = 0;
 
-  static Test *Start;
-  static Test *End;
+  static Benchmark *Start;
+  static Benchmark *End;
 };
 
-template <typename F> class FunctionBenchmark : public Test {
+template <typename F> class FunctionBenchmark : public Benchmark {
   F Func;
   const char *Name;
 
 public:
   FunctionBenchmark(F Func, char const *Name) : Func(Func), Name(Name) {
-    addTest(this);
+    addBenchmark(this);
   }
 
 private:
   void Run() override {
     BenchmarkOptions Options;
     auto latency = benchmark(Options, Func);
-    LIBC_NAMESPACE::libc_gpu_benchmarks::tlog << "FnName: " << Name << '\n';
-    LIBC_NAMESPACE::libc_gpu_benchmarks::tlog << "FnBenchmark: " << latency
-                                              << '\n';
+    tlog << "FnName: " << Name << '\n';
+    tlog << "FnBenchmark: " << latency << '\n';
+    tlog << "\n";
   }
   const char *getName() const override { return Name; }
 };
 
-class WrapperBenchmark : public Test {
+class WrapperBenchmark : public Benchmark {
   using BenchmarkWrapperFunction = uint64_t (*)();
   BenchmarkWrapperFunction Func;
   const char *Name;
@@ -208,22 +158,17 @@ class WrapperBenchmark : public Test {
 public:
   WrapperBenchmark(BenchmarkWrapperFunction Func, char const *Name)
       : Func(Func), Name(Name) {
-    addTest(this);
+    addBenchmark(this);
   }
 
 private:
   void Run() override {
     tlog << "Running wrapper: " << Name << '\n';
-    // for (int i = 0; i < 10; i++) {
-    //   auto overhead = LIBC_NAMESPACE::overhead();
-    //   auto result = Func() - overhead;
-    //   tlog << "Result: " << result << '\n';
-    //   tlog << "Overhead: " << overhead << '\n';
-    // }
     BenchmarkOptions Options;
     auto latency = benchmark_wrapper(Options, Func);
     tlog << "FnName: " << Name << '\n';
     tlog << "FnBenchmark: " << latency << '\n';
+    tlog << "\n";
   }
   const char *getName() const override { return Name; }
 };
@@ -232,31 +177,6 @@ class WrapperBenchmark : public Test {
 
 } // namespace LIBC_NAMESPACE
 
-// #define BENCHMARK(SuiteName, TestName)                                         \
-//   class SuiteName##_##TestName                                                 \
-//       : public LIBC_NAMESPACE::libc_gpu_benchmarks::Test {                     \
-//   public:                                                                      \
-//     SuiteName##_##TestName() { addTest(this); }                                \
-//     void Run() override;                                                       \
-//     const char *getName() const override { return #SuiteName "." #TestName; }  \
-//   };                                                                           \
-//   SuiteName##_##TestName SuiteName##_##TestName##_Instance;                    \
-//   void SuiteName##_##TestName::Run()
-
-#define BENCHMARK_SINGLE_INPUT_OUTPUT(SuiteName, TestName, Func, Arg)          \
-  class SuiteName##_##TestName                                                 \
-      : public LIBC_NAMESPACE::libc_gpu_benchmarks::Test {                     \
-  public:                                                                      \
-    SuiteName##_##TestName() { addTest(this); }                                \
-    void Run() override;                                                       \
-    const char *getName() const override { return #SuiteName "." #TestName; }  \
-  };                                                                           \
-  SuiteName##_##TestName SuiteName##_##TestName##_Instance;                    \
-  void SuiteName##_##TestName::Run() {                                         \
-    LIBC_NAMESPACE::libc_gpu_benchmarks::BenchmarkOptions Options;             \
-    LIBC_NAMESPACE::libc_gpu_benchmarks::benchmark(Options, &Func, Arg);       \
-  }
-
 #define BENCHMARK_FN(SuiteName, TestName, Func)                                \
   LIBC_NAMESPACE::libc_gpu_benchmarks::FunctionBenchmark                       \
       SuiteName##_##TestName##_Instance(Func, #SuiteName "." #TestName);
@@ -265,18 +185,4 @@ class WrapperBenchmark : public Test {
   LIBC_NAMESPACE::libc_gpu_benchmarks::WrapperBenchmark                        \
       SuiteName##_##TestName##_Instance(Func, #SuiteName "." #TestName);
 
-#define BENCHMARK_S_I_O_V2(SuiteName, TestName, Func, Arg)                     \
-  class SuiteName##_##TestName                                                 \
-      : public LIBC_NAMESPACE::libc_gpu_benchmarks::Test {                     \
-  public:                                                                      \
-    SuiteName##_##TestName() { addTest(this); }                                \
-    void Run() override;                                                       \
-    const char *getName() const override { return #SuiteName "." #TestName; }  \
-  };                                                                           \
-  SuiteName##_##TestName SuiteName##_##TestName##_Instance;                    \
-  void SuiteName##_##TestName::Run() {                                         \
-    LIBC_NAMESPACE::libc_gpu_benchmarks::BenchmarkOptions Options;             \
-    LIBC_NAMESPACE::libc_gpu_benchmarks::benchmark_macro(Options, &Func, Arg); \
-  }
-
 #endif
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp b/libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp
index e7e4f08f5af68..c971b00cc9a1b 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp
@@ -1,6 +1,6 @@
 #include "LibcGpuBenchmark.h"
 
 extern "C" int main(int argc, char **argv, char **envp) {
-  LIBC_NAMESPACE::libc_gpu_benchmarks::Test::runTests();
+  LIBC_NAMESPACE::libc_gpu_benchmarks::Benchmark::runBenchmarks();
   return 0;
 }
diff --git a/libc/benchmarks/gpu/TestLogger.cpp b/libc/benchmarks/gpu/TestLogger.cpp
deleted file mode 100644
index b3a8399a91adb..0000000000000
--- a/libc/benchmarks/gpu/TestLogger.cpp
+++ /dev/null
@@ -1,89 +0,0 @@
-#include "benchmarks/gpu/TestLogger.h"
-#include "src/__support/CPP/string.h"
-#include "src/__support/CPP/string_view.h"
-#include "src/__support/OSUtil/io.h"               // write_to_stderr
-#include "src/__support/big_int.h"                 // is_big_int
-#include "src/__support/macros/properties/types.h" // LIBC_TYPES_HAS_INT128
-#include "src/__support/uint128.h"
-
-#include <stdint.h>
-
-namespace LIBC_NAMESPACE {
-namespace libc_gpu_benchmarks {
-
-// cpp::string_view specialization
-template <>
-TestLogger &TestLogger::operator<< <cpp::string_view>(cpp::string_view str) {
-  LIBC_NAMESPACE::write_to_stderr(str);
-  return *this;
-}
-
-// cpp::string specialization
-template <> TestLogger &TestLogger::operator<< <cpp::string>(cpp::string str) {
-  return *this << static_cast<cpp::string_view>(str);
-}
-
-// const char* specialization
-template <> TestLogger &TestLogger::operator<< <const char *>(const char *str) {
-  return *this << cpp::string_view(str);
-}
-
-// char* specialization
-template <> TestLogger &TestLogger::operator<< <char *>(char *str) {
-  return *this << cpp::string_view(str);
-}
-
-// char specialization
-template <> TestLogger &TestLogger::operator<<(char ch) {
-  return *this << cpp::string_view(&ch, 1);
-}
-
-// bool specialization
-template <> TestLogger &TestLogger::operator<<(bool cond) {
-  return *this << (cond ? "true" : "false");
-}
-
-// void * specialization
-template <> TestLogger &TestLogger::operator<<(void *addr) {
-  return *this << "0x" << cpp::to_string(reinterpret_cast<uintptr_t>(addr));
-}
-
-template <typename T> TestLogger &TestLogger::operator<<(T t) {
-  if constexpr (is_big_int_v<T> ||
-                (cpp::is_integral_v<T> && cpp::is_unsigned_v<T> &&
-                 (sizeof(T) > sizeof(uint64_t)))) {
-    static_assert(sizeof(T) % 8 == 0, "Unsupported size of UInt");
-    const IntegerToString<T, radix::Hex::WithPrefix> buffer(t);
-    return *this << buffer.view();
-  } else {
-    return *this << cpp::to_string(t);
-  }
-}
-
-// is_integral specializations
-// char is already specialized to handle character
-template TestLogger &TestLogger::operator<< <short>(short);
-template TestLogger &TestLogger::operator<< <int>(int);
-template TestLogger &TestLogger::operator<< <long>(long);
-template TestLogger &TestLogger::operator<< <long long>(long long);
-template TestLogger &TestLogger::operator<< <unsigned char>(unsigned char);
-template TestLogger &TestLogger::operator<< <unsigned short>(unsigned short);
-template TestLogger &TestLogger::operator<< <unsigned int>(unsigned int);
-template TestLogger &TestLogger::operator<< <unsigned long>(unsigned long);
-template TestLogger &
-    TestLogger::operator<< <unsigned long long>(unsigned long long);
-
-#ifdef LIBC_TYPES_HAS_INT128
-template TestLogger &TestLogger::operator<< <__uint128_t>(__uint128_t);
-#endif // LIBC_TYPES_HAS_INT128
-template TestLogger &TestLogger::operator<< <UInt<128>>(UInt<128>);
-template TestLogger &TestLogger::operator<< <UInt<192>>(UInt<192>);
-template TestLogger &TestLogger::operator<< <UInt<256>>(UInt<256>);
-template TestLogger &TestLogger::operator<< <UInt<320>>(UInt<320>);
-
-// TODO: Add floating point formatting once it's supported by StringStream.
-
-TestLogger tlog;
-
-} // namespace libc_gpu_benchmarks
-} // namespace LIBC_NAMESPACE
diff --git a/libc/benchmarks/gpu/dummy.cpp b/libc/benchmarks/gpu/dummy.cpp
index 6d15f98fca220..fa9b220f33b23 100644
--- a/libc/benchmarks/gpu/dummy.cpp
+++ b/libc/benchmarks/gpu/dummy.cpp
@@ -1,42 +1,21 @@
 #include "LibcGpuBenchmark.h"
 #include "timing/timing.h"
 
-#include "src/stdio/fputs.h"
-
-int add_test(int x) {
-    return x + 1;
-}
+int add_test(int x) { return x + 1; }
 
 __attribute__((noinline)) [[gnu::noinline]] int function_call_overhead(int x) {
-    asm volatile ("");
-    return x;
+  asm volatile("");
+  return x;
 }
 
-// void DummyHiBenchmark() {
-//     LIBC_NAMESPACE::fputs("Hi\n", stderr);
-// }
-// BENCHMARK_FN(Dummy, DummyHiBenchmark, DummyHiBenchmark);
-
-// void DummyV2Benchmark() {
-//     int result = dummy_hi(10);
-//     asm volatile("" :: "r"(result));
-//     auto test_cycles = LIBC_NAMESPACE::latency(dummy_hi, 10);
-//     LIBC_NAMESPACE::libc_gpu_benchmarks::tlog << "In func: " << test_cycles << '\n';
-// }
-// BENCHMARK_FN(Dummy, DummyV2Benchmark, DummyV2Benchmark);
-
-// BENCHMARK_SINGLE_INPUT_OUTPUT(Dummy, DummySingleInputOutput, dummy_hi, 10);
-
-// BENCHMARK_S_I_O_V2(Dummy, DummySIOMacro, dummy_hi, 10);
-
-// uint64_t DummyWrapperBenchmark() {
-//     int x = 10;
-//     return LIBC_NAMESPACE::latency(add_test, x);
-// }
-// BENCHMARK_WRAPPER(Dummy, DummyWrapperBenchmark, DummyWrapperBenchmark);
+uint64_t DummyWrapperBenchmark() {
+  int x = 10;
+  return LIBC_NAMESPACE::latency(add_test, x);
+}
+BENCHMARK_WRAPPER(Dummy, DummyWrapperBenchmark, DummyWrapperBenchmark);
 
 uint64_t DummyFunctionCallOverhead() {
-    int x = 10;
-    return LIBC_NAMESPACE::latency(function_call_overhead, x);
+  int x = 10;
+  return LIBC_NAMESPACE::latency(function_call_overhead, x);
 }
 BENCHMARK_WRAPPER(Dummy, DummyFunctionCallOverhead, DummyFunctionCallOverhead);
diff --git a/libc/benchmarks/gpu/timing/CMakeLists.txt b/libc/benchmarks/gpu/timing/CMakeLists.txt
index 4a65b25005bab..0e6a5a6b47968 100644
--- a/libc/benchmarks/gpu/timing/CMakeLists.txt
+++ b/libc/benchmarks/gpu/timing/CMakeLists.txt
@@ -1,16 +1,8 @@
-message(STATUS "In GPU subdir")
-# if(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU)
-#     message(STATUS "Target arch is NOT gpu")
-#     return()
-# endif()
-
 foreach(target nvptx)
     add_subdirectory(${target})
     list(APPEND target_gpu_timing libc.benchmarks.gpu.timing.${target}.${target}_timing)
 endforeach()
 
-message(STATUS "GPU TIMING: ${target_gpu_timing}")
-
 add_header_library(
     timing
     HDRS
diff --git a/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt b/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt
index 7343ab7791197..9958e16206a41 100644
--- a/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt
+++ b/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt
@@ -1,4 +1,3 @@
-message(STATUS "IN nvptx dir")
 add_header_library(
   nvptx_timing
   HDRS

>From 9273e50a8876b502475aea1547bd84d47c32b55e Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Sun, 12 May 2024 16:44:06 -0400
Subject: [PATCH 3/3] clean up experimentation code

---
 libc/benchmarks/CMakeLists.txt                |  6 +-
 libc/benchmarks/gpu/CMakeLists.txt            | 15 +--
 libc/benchmarks/gpu/LibcGpuBenchmark.cpp      | 23 ++---
 libc/benchmarks/gpu/LibcGpuBenchmark.h        | 92 +++----------------
 libc/benchmarks/gpu/dummy.cpp                 | 21 -----
 .../gpu/src/ctype/isalnum_benchmark.cpp       | 24 +++--
 .../gpu/src/ctype/isalpha_benchmark.cpp       |  2 +-
 libc/benchmarks/gpu/timing/nvptx/timing.h     | 67 --------------
 libc/benchmarks/gpu/timing/timing.h           |  3 +-
 9 files changed, 40 insertions(+), 213 deletions(-)
 delete mode 100644 libc/benchmarks/gpu/dummy.cpp

diff --git a/libc/benchmarks/CMakeLists.txt b/libc/benchmarks/CMakeLists.txt
index 1fb7026d79359..a802e653a091e 100644
--- a/libc/benchmarks/CMakeLists.txt
+++ b/libc/benchmarks/CMakeLists.txt
@@ -91,10 +91,10 @@ if(NOT LIBC_TARGET_OS_IS_GPU)
 	)
 	target_link_libraries(libc-benchmark
 	    PUBLIC
-    benchmark::benchmark
-	    LLVMSupport
+	    benchmark::benchmark
+            LLVMSupport
 	    LLVMTargetParser
-    Threads::Threads
+	    Threads::Threads
 	)
 	add_dependencies(libc-benchmark google-benchmark-libc)
 	llvm_update_compile_flags(libc-benchmark)
diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
index 5920c5b5e5dfc..a18be27e33573 100644
--- a/libc/benchmarks/gpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/CMakeLists.txt
@@ -85,7 +85,7 @@ function (add_gpu_benchmark test_name)
   target_include_directories(${fq_build_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR})
   target_include_directories(${fq_build_target_name} PRIVATE ${LIBC_SOURCE_DIR})
   _get_hermetic_test_compile_options(compile_options "${GPU_BENCHMARK_COMPILE_OPTIONS}")
-  target_compile_options(${fq_build_target_name} PRIVATE ${compile_options} -save-temps)
+  target_compile_options(${fq_build_target_name} PRIVATE ${compile_options})
 
   set(link_libraries "")
   foreach(lib in LISTS GPU_BENCHMARK_LINK_LIBRARIES)
@@ -180,17 +180,4 @@ add_unittest_framework_library(
     libc.benchmarks.gpu.timing.timing
 )
 
-add_custom_target(dummy-suite)
-
-add_gpu_benchmark(
-  dummy
-  SUITE
-    dummy-suite
-  SRCS
-    dummy.cpp
-  DEPENDS
-    libc.src.stdio.fputs
-    libc.src.stdio.stderr
-)
-
 add_subdirectory(src)
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index f46f4a08362d8..d37f5a0a53a70 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -1,10 +1,3 @@
-
-#include "benchmarks/gpu/timing/timing.h"
-
-#include "src/__support/CPP/string.h"
-#include "src/__support/CPP/string_view.h"
-#include "src/__support/OSUtil/io.h"
-
 #include "LibcGpuBenchmark.h"
 
 namespace LIBC_NAMESPACE {
@@ -26,16 +19,17 @@ void Benchmark::addBenchmark(Benchmark *B) {
 
 int Benchmark::runBenchmarks() {
   for (Benchmark *B = Start; B != nullptr; B = B->Next) {
-    tlog << B->getName() << "\n";
     B->Run();
   }
 
   return 0;
 }
 
-uint64_t benchmark_wrapper(const BenchmarkOptions &Options,
-                           uint64_t (*WrapperFunc)()) {
+BenchmarkResult benchmark(const BenchmarkOptions &Options,
+                          uint64_t (*WrapperFunc)()) {
+  BenchmarkResult Result;
   RuntimeEstimationProgression REP;
+  size_t TotalIterations = 0;
   size_t Iterations = Options.InitialIterations;
   if (Iterations < (uint32_t)1) {
     Iterations = 1;
@@ -53,6 +47,7 @@ uint64_t benchmark_wrapper(const BenchmarkOptions &Options,
 
     Samples++;
     TotalCycles += SampleCycles;
+    TotalIterations += Iterations;
     const double ChangeRatio =
         REP.ComputeImprovement({Iterations, SampleCycles});
     BestGuess = REP.CurrentEstimation;
@@ -60,15 +55,15 @@ uint64_t benchmark_wrapper(const BenchmarkOptions &Options,
     if (Samples >= Options.MaxSamples || Iterations >= Options.MaxIterations) {
       break;
     } else if (Samples >= Options.MinSamples && ChangeRatio < Options.Epsilon) {
-      tlog << "Samples are stable!\n";
       break;
     }
 
     Iterations *= Options.ScalingFactor;
   }
-  tlog << "Best Guess: " << BestGuess << '\n';
-  tlog << "Samples: " << Samples << '\n';
-  return BestGuess;
+  Result.Cycles = BestGuess;
+  Result.Samples = Samples;
+  Result.TotalIterations = TotalIterations;
+  return Result;
 };
 
 } // namespace libc_gpu_benchmarks
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index c9e4014808dc0..ccbbe3629dbda 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -4,10 +4,8 @@
 #include "benchmarks/gpu/timing/timing.h"
 
 #include "benchmarks/gpu/BenchmarkLogger.h"
-#include "src/__support/CPP/string.h"
-#include "src/__support/CPP/string_view.h"
-#include "src/__support/OSUtil/io.h"
 
+#include <stddef.h>
 #include <stdint.h>
 
 namespace LIBC_NAMESPACE {
@@ -61,53 +59,14 @@ class RuntimeEstimationProgression {
   }
 };
 
-template <typename F, typename... Args>
-uint64_t benchmark(const BenchmarkOptions &Options, F f, Args... args) {
-  RuntimeEstimationProgression REP;
-  size_t Iterations = Options.InitialIterations;
-  if (Iterations < (uint32_t)1) {
-    Iterations = 1;
-  }
+struct BenchmarkResult {
+  uint64_t Cycles = 0;
   size_t Samples = 0;
-  uint64_t BestGuess = 0;
-  uint64_t TotalCycles = 0;
-#if defined(LIBC_TARGET_ARCH_IS_NVPTX)
-  // Nvidia cannot perform LTO, so we need to perform
-  // 1 call to "warm up" the function before microbenchmarking
-  uint64_t result = latency(f, args...);
-  tlog << "Running warm-up iteration: " << result << '\n';
-#endif
-  for (;;) {
-    uint64_t SampleCycles = 0;
-    for (uint32_t i = 0; i < Iterations; i++) {
-      uint64_t result = latency(f, args...);
-      SampleCycles += result;
-      tlog << result << '\n';
-    }
-
-    Samples++;
-    TotalCycles += SampleCycles;
-    const double ChangeRatio =
-        REP.ComputeImprovement({Iterations, SampleCycles});
-    BestGuess = REP.CurrentEstimation;
-
-    if (Samples >= Options.MaxSamples || Iterations >= Options.MaxIterations) {
-      break;
-    } else if (Samples >= Options.MinSamples && ChangeRatio < Options.Epsilon) {
-      tlog << "Samples are stable!\n";
-      break;
-    }
-
-    Iterations *= Options.ScalingFactor;
-  }
-  tlog << "Best Guess: " << BestGuess << '\n';
-  tlog << "Samples: " << Samples << '\n';
-  tlog << "\n";
-  return BestGuess;
+  size_t TotalIterations = 0;
 };
 
-uint64_t benchmark_wrapper(const BenchmarkOptions &Options,
-                           uint64_t (*WrapperFunc)());
+BenchmarkResult benchmark(const BenchmarkOptions &Options,
+                          uint64_t (*WrapperFunc)());
 
 class Benchmark {
   Benchmark *Next = nullptr;
@@ -130,26 +89,6 @@ class Benchmark {
   static Benchmark *End;
 };
 
-template <typename F> class FunctionBenchmark : public Benchmark {
-  F Func;
-  const char *Name;
-
-public:
-  FunctionBenchmark(F Func, char const *Name) : Func(Func), Name(Name) {
-    addBenchmark(this);
-  }
-
-private:
-  void Run() override {
-    BenchmarkOptions Options;
-    auto latency = benchmark(Options, Func);
-    tlog << "FnName: " << Name << '\n';
-    tlog << "FnBenchmark: " << latency << '\n';
-    tlog << "\n";
-  }
-  const char *getName() const override { return Name; }
-};
-
 class WrapperBenchmark : public Benchmark {
   using BenchmarkWrapperFunction = uint64_t (*)();
   BenchmarkWrapperFunction Func;
@@ -163,25 +102,20 @@ class WrapperBenchmark : public Benchmark {
 
 private:
   void Run() override {
-    tlog << "Running wrapper: " << Name << '\n';
     BenchmarkOptions Options;
-    auto latency = benchmark_wrapper(Options, Func);
-    tlog << "FnName: " << Name << '\n';
-    tlog << "FnBenchmark: " << latency << '\n';
-    tlog << "\n";
+    constexpr auto GREEN = "\033[32m";
+    constexpr auto RESET = "\033[0m";
+    blog << GREEN << "[ RUN      ] " << RESET << Name << '\n';
+    auto result = benchmark(Options, Func); // measured after the RUN banner
+    blog << GREEN << "[       OK ] " << RESET << Name << ": " << result.Cycles
+         << " cycles, " << result.TotalIterations << " iterations\n";
   }
   const char *getName() const override { return Name; }
 };
-
 } // namespace libc_gpu_benchmarks
-
 } // namespace LIBC_NAMESPACE
 
-#define BENCHMARK_FN(SuiteName, TestName, Func)                                \
-  LIBC_NAMESPACE::libc_gpu_benchmarks::FunctionBenchmark                       \
-      SuiteName##_##TestName##_Instance(Func, #SuiteName "." #TestName);
-
-#define BENCHMARK_WRAPPER(SuiteName, TestName, Func)                           \
+#define BENCHMARK(SuiteName, TestName, Func)                                   \
   LIBC_NAMESPACE::libc_gpu_benchmarks::WrapperBenchmark                        \
       SuiteName##_##TestName##_Instance(Func, #SuiteName "." #TestName);
 
diff --git a/libc/benchmarks/gpu/dummy.cpp b/libc/benchmarks/gpu/dummy.cpp
deleted file mode 100644
index fa9b220f33b23..0000000000000
--- a/libc/benchmarks/gpu/dummy.cpp
+++ /dev/null
@@ -1,21 +0,0 @@
-#include "LibcGpuBenchmark.h"
-#include "timing/timing.h"
-
-int add_test(int x) { return x + 1; }
-
-__attribute__((noinline)) [[gnu::noinline]] int function_call_overhead(int x) {
-  asm volatile("");
-  return x;
-}
-
-uint64_t DummyWrapperBenchmark() {
-  int x = 10;
-  return LIBC_NAMESPACE::latency(add_test, x);
-}
-BENCHMARK_WRAPPER(Dummy, DummyWrapperBenchmark, DummyWrapperBenchmark);
-
-uint64_t DummyFunctionCallOverhead() {
-  int x = 10;
-  return LIBC_NAMESPACE::latency(function_call_overhead, x);
-}
-BENCHMARK_WRAPPER(Dummy, DummyFunctionCallOverhead, DummyFunctionCallOverhead);
diff --git a/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp b/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp
index 071675bb887b6..8d9c958bb7ed4 100644
--- a/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp
+++ b/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp
@@ -2,23 +2,21 @@
 
 #include "src/ctype/isalnum.h"
 
-// BENCHMARK_SINGLE_INPUT_OUTPUT(LlvmLibcIsAlNumGpuBenchmark,
-//                               IsAlnumSingleInputOutput,
-//                               LIBC_NAMESPACE::isalnum, 'c');
-
-// void BM_IsAlnumBasic() { bool isAlpha = LIBC_NAMESPACE::isalnum('c'); }
-// BENCHMARK_FN(LlvmLibcIsAlNumGpuBenchmark, IsAlnumC, BM_IsAlnumBasic);
-
-uint64_t BM_IsAlnumWrapper() {
+uint64_t BM_IsAlnum() {
   char x = 'c';
   return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalnum, x);
 }
-BENCHMARK_WRAPPER(LlvmLibcIsAlNumGpuBenchmark, IsAlnumWrapper,
-                  BM_IsAlnumWrapper);
+BENCHMARK(LlvmLibcIsAlNumGpuBenchmark, IsAlnumWrapper, BM_IsAlnum);
+
+[[gnu::noinline]] static uint64_t single_input_function(int x) {
+  asm volatile("" ::"r"(x)); // prevent the compiler from optimizing out x
+  return x;
+}
 
 uint64_t BM_IsAlnumWithOverhead() {
   char x = 'c';
-  return LIBC_NAMESPACE::function_call_latency(LIBC_NAMESPACE::isalnum, x);
+  return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalnum, x) -
+         LIBC_NAMESPACE::latency(single_input_function, 0);
 }
-BENCHMARK_WRAPPER(LlvmLibcIsAlNumGpuBenchmark, IsAlnumWithOverhead,
-                  BM_IsAlnumWithOverhead);
+BENCHMARK(LlvmLibcIsAlNumGpuBenchmark, IsAlnumWithOverhead,
+          BM_IsAlnumWithOverhead);
diff --git a/libc/benchmarks/gpu/src/ctype/isalpha_benchmark.cpp b/libc/benchmarks/gpu/src/ctype/isalpha_benchmark.cpp
index e432b1cf72c5f..2038eb89bc77b 100644
--- a/libc/benchmarks/gpu/src/ctype/isalpha_benchmark.cpp
+++ b/libc/benchmarks/gpu/src/ctype/isalpha_benchmark.cpp
@@ -6,4 +6,4 @@ uint64_t BM_IsAlpha() {
   char x = 'c';
   return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalpha, x);
 }
-BENCHMARK_WRAPPER(LlvmLibcIsAlphaGpuBenchmark, IsAlpha, BM_IsAlpha);
+BENCHMARK(LlvmLibcIsAlphaGpuBenchmark, IsAlpha, BM_IsAlpha);
diff --git a/libc/benchmarks/gpu/timing/nvptx/timing.h b/libc/benchmarks/gpu/timing/nvptx/timing.h
index 39c23d596b7f3..008432e6aa1d2 100644
--- a/libc/benchmarks/gpu/timing/nvptx/timing.h
+++ b/libc/benchmarks/gpu/timing/nvptx/timing.h
@@ -79,38 +79,6 @@ template <typename F, typename T>
   return stop - start;
 }
 
-[[gnu::noinline]] static uint64_t single_input_function(int x) {
-  asm volatile("" :: "r"(x)); // prevent the compiler from optimizing out x
-  return x;
-}
-
-template <typename F, typename T>
-static LIBC_INLINE uint64_t function_call_latency(F f, T t) {
-  auto function_call_overhead = latency(single_input_function, 0);
-  return latency(f, t) - function_call_overhead;
-}
-
-static LIBC_INLINE uint64_t latency(void (*f)()) {
-  // Get the current timestamp from the clock.
-  gpu::sync_threads();
-  uint64_t start = gpu::processor_clock();
-
-  // This forces the compiler to load the input argument and run the clock cycle
-  // counter before the profiling region.
-  asm volatile("" ::"llr"(start));
-
-  // Run the function under test and return its value.
-  f();
-
-  // Obtain the current timestamp after running the calculation and force
-  // ordering.
-  uint64_t stop = gpu::processor_clock();
-  gpu::sync_threads();
-
-  // Return the time elapsed.
-  return stop - start;
-}
-
 template <typename F, typename T1, typename T2>
 static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {
   volatile T1 storage = t1;
@@ -135,41 +103,6 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {
 
   return stop - start;
 }
-
 } // namespace LIBC_NAMESPACE
 
-/**
- * LatencyP must be a pointer to a uint64_t holding the result variable
- */
-#define SINGLE_INPUT_OUTPUT_LATENCY(Func, t, LatencyP)                         \
-  do {                                                                         \
-    tlog << "Latency: " << ((long)(*LatencyP)) << '\n';                        \
-    *LatencyP = 200;                                                           \
-    volatile auto storage = t;                                                 \
-    auto arg = storage;                                                        \
-    asm volatile("" ::"r"(arg), "r"(LatencyP));                                \
-                                                                               \
-    LIBC_NAMESPACE::gpu::sync_threads();                                       \
-    uint64_t start = LIBC_NAMESPACE::gpu::processor_clock();                   \
-                                                                               \
-    asm volatile("" ::"r"(arg), "llr"(start));                                 \
-    auto result = Func(arg);                                                   \
-    asm volatile("" ::"r"(LatencyP));                                          \
-    *LatencyP = 312;                                                           \
-    asm volatile("" ::"r"(LatencyP));                                          \
-    asm volatile("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :);     \
-    asm volatile("" ::"r"(LatencyP));                                          \
-    *LatencyP = 499;                                                           \
-                                                                               \
-    uint64_t stop = gpu::processor_clock();                                    \
-    gpu::sync_threads();                                                       \
-    volatile auto output = result;                                             \
-                                                                               \
-    tlog << "Start: " << start << '\n';                                        \
-    tlog << "Stop: " << stop << '\n';                                          \
-    tlog << "Diff: " << (stop - start) << '\n';                                \
-    asm volatile("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(LatencyP) :);   \
-    *LatencyP = stop - start;                                                  \
-  } while (0)
-
 #endif // LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
diff --git a/libc/benchmarks/gpu/timing/timing.h b/libc/benchmarks/gpu/timing/timing.h
index 45201e56964e6..f37381fdf65db 100644
--- a/libc/benchmarks/gpu/timing/timing.h
+++ b/libc/benchmarks/gpu/timing/timing.h
@@ -19,4 +19,5 @@
 #error "unsupported platform"
 #endif
 
-#endif // LLVM_LIBC_UTILS_GPU_TIMING_H
\ No newline at end of file
+#endif // LLVM_LIBC_UTILS_GPU_TIMING_H
+