[libc-commits] [libc] [libc] NVPTX Profiling Draft (PR #92009)
via libc-commits
libc-commits at lists.llvm.org
Mon May 13 11:46:48 PDT 2024
https://github.com/jameshu15869 created https://github.com/llvm/llvm-project/pull/92009
Draft PR adding microbenchmarking infrastructure for NVPTX. Because `nvlink` cannot perform LTO, `libc` functions cannot be inlined into the benchmarks, and the resulting function-call overhead is not adjusted for during microbenchmarking.
>From 929819c8619d98c49ff6b7295faad7d83f1a956f Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Sat, 11 May 2024 17:19:54 -0400
Subject: [PATCH 1/3] initial nvptx microbenchmarking infrastructure
---
libc/CMakeLists.txt | 2 +
libc/benchmarks/CMakeLists.txt | 412 +++++++++---------
libc/benchmarks/gpu/CMakeLists.txt | 203 +++++++++
libc/benchmarks/gpu/LibcGpuBenchmark.cpp | 75 ++++
libc/benchmarks/gpu/LibcGpuBenchmark.h | 282 ++++++++++++
libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp | 6 +
libc/benchmarks/gpu/TestLogger.cpp | 89 ++++
libc/benchmarks/gpu/TestLogger.h | 27 ++
libc/benchmarks/gpu/dummy.cpp | 42 ++
libc/benchmarks/gpu/src/CMakeLists.txt | 2 +
libc/benchmarks/gpu/src/ctype/CMakeLists.txt | 21 +
.../gpu/src/ctype/isalnum_benchmark.cpp | 24 +
.../gpu/src/ctype/isalpha_benchmark.cpp | 9 +
libc/benchmarks/gpu/src/math/CMakeLists.txt | 0
libc/benchmarks/gpu/timing/CMakeLists.txt | 20 +
.../gpu/timing/nvptx/CMakeLists.txt | 8 +
libc/benchmarks/gpu/timing/nvptx/timing.h | 175 ++++++++
libc/benchmarks/gpu/timing/timing.h | 22 +
18 files changed, 1216 insertions(+), 203 deletions(-)
create mode 100644 libc/benchmarks/gpu/CMakeLists.txt
create mode 100644 libc/benchmarks/gpu/LibcGpuBenchmark.cpp
create mode 100644 libc/benchmarks/gpu/LibcGpuBenchmark.h
create mode 100644 libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp
create mode 100644 libc/benchmarks/gpu/TestLogger.cpp
create mode 100644 libc/benchmarks/gpu/TestLogger.h
create mode 100644 libc/benchmarks/gpu/dummy.cpp
create mode 100644 libc/benchmarks/gpu/src/CMakeLists.txt
create mode 100644 libc/benchmarks/gpu/src/ctype/CMakeLists.txt
create mode 100644 libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp
create mode 100644 libc/benchmarks/gpu/src/ctype/isalpha_benchmark.cpp
create mode 100644 libc/benchmarks/gpu/src/math/CMakeLists.txt
create mode 100644 libc/benchmarks/gpu/timing/CMakeLists.txt
create mode 100644 libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt
create mode 100644 libc/benchmarks/gpu/timing/nvptx/timing.h
create mode 100644 libc/benchmarks/gpu/timing/timing.h
diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt
index 175efd89d67e6..ba6eadd207a4f 100644
--- a/libc/CMakeLists.txt
+++ b/libc/CMakeLists.txt
@@ -400,7 +400,9 @@ if(LLVM_INCLUDE_TESTS)
add_subdirectory(fuzzing)
endif()
+message(STATUS "Checking on variable: ${LIBC_INCLUDE_BENCHMARKS}")
if(LIBC_INCLUDE_BENCHMARKS)
+ message(STATUS "including libc benchmarks")
add_subdirectory(benchmarks)
endif()
diff --git a/libc/benchmarks/CMakeLists.txt b/libc/benchmarks/CMakeLists.txt
index 4978da65850cc..1fb7026d79359 100644
--- a/libc/benchmarks/CMakeLists.txt
+++ b/libc/benchmarks/CMakeLists.txt
@@ -1,205 +1,211 @@
-find_package(Threads)
-
-set(LLVM_LINK_COMPONENTS
- Support
- TargetParser
- )
-
-#==============================================================================
-# Add Unit Testing Support
-#==============================================================================
-
-function(add_libc_benchmark_unittest target_name)
- if(NOT LLVM_INCLUDE_TESTS)
- return()
- endif()
-
- cmake_parse_arguments(
- "LIBC_BENCHMARKS_UNITTEST"
- "" # No optional arguments
- "SUITE" # Single value arguments
- "SRCS;DEPENDS" # Multi-value arguments
- ${ARGN}
- )
-
- add_executable(${target_name}
- EXCLUDE_FROM_ALL
- ${LIBC_BENCHMARKS_UNITTEST_SRCS}
- )
- target_link_libraries(${target_name}
- PRIVATE
- llvm_gtest_main
- llvm_gtest
- ${LIBC_BENCHMARKS_UNITTEST_DEPENDS}
- )
- llvm_update_compile_flags(${target_name})
-
- add_custom_command(
- TARGET ${target_name}
- POST_BUILD
- COMMAND $<TARGET_FILE:${target_name}>
- )
- add_dependencies(libc-benchmark-util-tests ${target_name})
-endfunction()
-
-#==============================================================================
-# Build Google Benchmark for libc
-#==============================================================================
-
-include(ExternalProject)
-ExternalProject_Add(google-benchmark-libc
- EXCLUDE_FROM_ALL ON
- PREFIX google-benchmark-libc
- SOURCE_DIR ${LLVM_THIRD_PARTY_DIR}/benchmark
- INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/google-benchmark-libc
- CMAKE_CACHE_ARGS
- -DBENCHMARK_ENABLE_EXCEPTIONS:BOOL=OFF
- -DBENCHMARK_ENABLE_LTO:BOOL=OFF
- -DBENCHMARK_ENABLE_TESTING:BOOL=OFF
- -DBENCHMARK_ENABLE_WERROR:BOOL=${LLVM_ENABLE_WERROR}
- -DBENCHMARK_FORCE_WERROR:BOOL=OFF
- -DBENCHMARK_USE_LIBCXX:BOOL=OFF
- -DCMAKE_BUILD_TYPE:STRING=Release
-
- -DCMAKE_SYSTEM_NAME:STRING=${CMAKE_SYSTEM_NAME}
- -DCMAKE_SYSTEM_PROCESSOR:STRING=${CMAKE_SYSTEM_PROCESSOR}
- -DCMAKE_C_COMPILER:STRING=${CMAKE_C_COMPILER}
- -DCMAKE_CXX_COMPILER:STRING=${CMAKE_CXX_COMPILER}
- -DCMAKE_CXX_FLAGS:STRING=${CMAKE_CXX_FLAGS}
- -DCMAKE_FIND_ROOT_PATH:STRING=${CMAKE_FIND_ROOT_PATH}
-
- -DBUILD_SHARED_LIBS:BOOL=OFF
- -DCMAKE_EXE_LINKER_FLAGS:STRING=-static
-
- -DCMAKE_CXX_STANDARD:STRING=14
- -DCMAKE_INSTALL_PREFIX:PATH=<INSTALL_DIR>
- )
-
-add_custom_target(libc-benchmark-util-tests)
-
-# libc-benchmark
-add_library(libc-benchmark
- STATIC
- EXCLUDE_FROM_ALL
- LibcBenchmark.cpp
- LibcBenchmark.h
-)
-
-target_include_directories(libc-benchmark
- PUBLIC ${LLVM_INCLUDE_DIR} ${LLVM_MAIN_INCLUDE_DIR}
-)
-target_link_libraries(libc-benchmark
- PUBLIC
+if(NOT LIBC_TARGET_OS_IS_GPU)
+ find_package(Threads)
+
+ set(LLVM_LINK_COMPONENTS
+ Support
+ TargetParser
+ )
+
+ #==============================================================================
+ # Add Unit Testing Support
+ #==============================================================================
+
+ function(add_libc_benchmark_unittest target_name)
+ if(NOT LLVM_INCLUDE_TESTS)
+ return()
+ endif()
+
+ cmake_parse_arguments(
+ "LIBC_BENCHMARKS_UNITTEST"
+ "" # No optional arguments
+ "SUITE" # Single value arguments
+ "SRCS;DEPENDS" # Multi-value arguments
+ ${ARGN}
+ )
+
+ add_executable(${target_name}
+ EXCLUDE_FROM_ALL
+ ${LIBC_BENCHMARKS_UNITTEST_SRCS}
+ )
+ target_link_libraries(${target_name}
+ PRIVATE
+ llvm_gtest_main
+ llvm_gtest
+ ${LIBC_BENCHMARKS_UNITTEST_DEPENDS}
+ )
+ llvm_update_compile_flags(${target_name})
+
+ add_custom_command(
+ TARGET ${target_name}
+ POST_BUILD
+ COMMAND $<TARGET_FILE:${target_name}>
+ )
+ add_dependencies(libc-benchmark-util-tests ${target_name})
+ endfunction()
+
+ #==============================================================================
+ # Build Google Benchmark for libc
+ #==============================================================================
+
+ include(ExternalProject)
+ ExternalProject_Add(google-benchmark-libc
+ EXCLUDE_FROM_ALL ON
+ PREFIX google-benchmark-libc
+ SOURCE_DIR ${LLVM_THIRD_PARTY_DIR}/benchmark
+ INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/google-benchmark-libc
+ CMAKE_CACHE_ARGS
+ -DBENCHMARK_ENABLE_EXCEPTIONS:BOOL=OFF
+ -DBENCHMARK_ENABLE_LTO:BOOL=OFF
+ -DBENCHMARK_ENABLE_TESTING:BOOL=OFF
+ -DBENCHMARK_ENABLE_WERROR:BOOL=${LLVM_ENABLE_WERROR}
+ -DBENCHMARK_FORCE_WERROR:BOOL=OFF
+ -DBENCHMARK_USE_LIBCXX:BOOL=OFF
+ -DCMAKE_BUILD_TYPE:STRING=Release
+
+ -DCMAKE_SYSTEM_NAME:STRING=${CMAKE_SYSTEM_NAME}
+ -DCMAKE_SYSTEM_PROCESSOR:STRING=${CMAKE_SYSTEM_PROCESSOR}
+ -DCMAKE_C_COMPILER:STRING=${CMAKE_C_COMPILER}
+ -DCMAKE_CXX_COMPILER:STRING=${CMAKE_CXX_COMPILER}
+ -DCMAKE_CXX_FLAGS:STRING=${CMAKE_CXX_FLAGS}
+ -DCMAKE_FIND_ROOT_PATH:STRING=${CMAKE_FIND_ROOT_PATH}
+
+ -DBUILD_SHARED_LIBS:BOOL=OFF
+ -DCMAKE_EXE_LINKER_FLAGS:STRING=-static
+
+ -DCMAKE_CXX_STANDARD:STRING=14
+ -DCMAKE_INSTALL_PREFIX:PATH=<INSTALL_DIR>
+ )
+
+ add_custom_target(libc-benchmark-util-tests)
+
+ # libc-benchmark
+ add_library(libc-benchmark
+ STATIC
+ EXCLUDE_FROM_ALL
+ LibcBenchmark.cpp
+ LibcBenchmark.h
+ )
+
+ target_include_directories(libc-benchmark
+ PUBLIC ${LLVM_INCLUDE_DIR} ${LLVM_MAIN_INCLUDE_DIR}
+ )
+ target_link_libraries(libc-benchmark
+ PUBLIC
benchmark::benchmark
- LLVMSupport
- LLVMTargetParser
+ LLVMSupport
+ LLVMTargetParser
Threads::Threads
-)
-add_dependencies(libc-benchmark google-benchmark-libc)
-llvm_update_compile_flags(libc-benchmark)
-
-add_libc_benchmark_unittest(libc-benchmark-test
- SRCS LibcBenchmarkTest.cpp
- DEPENDS libc-benchmark
-)
-
-# libc-memory-benchmark
-add_library(libc-memory-benchmark
- STATIC
- EXCLUDE_FROM_ALL
- LibcMemoryBenchmark.cpp
- LibcMemoryBenchmark.h
- LibcFunctionPrototypes.h
- MemorySizeDistributions.cpp
- MemorySizeDistributions.h
-)
-target_include_directories(libc-memory-benchmark
- PUBLIC
- ${CMAKE_CURRENT_SOURCE_DIR}
-)
-target_link_libraries(libc-memory-benchmark
- PUBLIC
- libc-benchmark
-)
-llvm_update_compile_flags(libc-memory-benchmark)
-
-add_libc_benchmark_unittest(libc-memory-benchmark-test
- SRCS LibcMemoryBenchmarkTest.cpp
- DEPENDS libc-memory-benchmark
-)
-
-# json
-add_library(json
- STATIC
- EXCLUDE_FROM_ALL
- JSON.cpp
- JSON.h
-)
-target_link_libraries(json PUBLIC libc-memory-benchmark)
-llvm_update_compile_flags(json)
-
-add_libc_benchmark_unittest(json-test
- SRCS JSONTest.cpp
- DEPENDS json
-)
-
-#==============================================================================
-# Benchmarking tool
-#==============================================================================
-
-# Benchmark all implementations that can run on the target CPU.
-function(add_libc_multi_impl_benchmark name)
- get_property(fq_implementations GLOBAL PROPERTY ${name}_implementations)
- foreach(fq_config_name IN LISTS fq_implementations)
- get_target_property(required_cpu_features ${fq_config_name} REQUIRE_CPU_FEATURES)
- cpu_supports(can_run "${required_cpu_features}")
- if(can_run)
- set(benchmark_name ${fq_config_name}_benchmark)
- add_executable(${benchmark_name}
- EXCLUDE_FROM_ALL
- LibcMemoryBenchmarkMain.cpp
- )
- get_target_property(entrypoint_object_file ${fq_config_name} "OBJECT_FILE_RAW")
- target_link_libraries(${benchmark_name} PUBLIC json ${entrypoint_object_file})
- string(TOUPPER ${name} name_upper)
- target_compile_definitions(${benchmark_name} PRIVATE "-DLIBC_BENCHMARK_FUNCTION_${name_upper}=LIBC_NAMESPACE::${name}" "-DLIBC_BENCHMARK_FUNCTION_NAME=\"${fq_config_name}\"")
- llvm_update_compile_flags(${benchmark_name})
- else()
- message(STATUS "Skipping benchmark for '${fq_config_name}' insufficient host cpu features '${required_cpu_features}'")
- endif()
- endforeach()
-endfunction()
-
-add_libc_multi_impl_benchmark(bcmp)
-add_libc_multi_impl_benchmark(bzero)
-add_libc_multi_impl_benchmark(memcmp)
-add_libc_multi_impl_benchmark(memcpy)
-add_libc_multi_impl_benchmark(memmove)
-add_libc_multi_impl_benchmark(memset)
-
-#==============================================================================
-# Google Benchmarking tool
-#==============================================================================
-
-# This target uses the Google Benchmark facility to report throughput for llvm
-# libc memory functions compiled for the host machine. This is useful to
-# continuously monitor the performance of the memory functions.
-add_executable(libc.benchmarks.memory_functions.opt_host
- EXCLUDE_FROM_ALL
- LibcMemoryGoogleBenchmarkMain.cpp
- LibcDefaultImplementations.cpp
-)
-target_link_libraries(libc.benchmarks.memory_functions.opt_host
- PRIVATE
- libc-memory-benchmark
- libc.src.string.memcmp_opt_host.__internal__
- libc.src.string.bcmp_opt_host.__internal__
- libc.src.string.memcpy_opt_host.__internal__
- libc.src.string.memset_opt_host.__internal__
- libc.src.string.bzero_opt_host.__internal__
- libc.src.string.memmove_opt_host.__internal__
- benchmark_main
-)
-llvm_update_compile_flags(libc.benchmarks.memory_functions.opt_host)
-
-add_subdirectory(automemcpy)
+ )
+ add_dependencies(libc-benchmark google-benchmark-libc)
+ llvm_update_compile_flags(libc-benchmark)
+
+ add_libc_benchmark_unittest(libc-benchmark-test
+ SRCS LibcBenchmarkTest.cpp
+ DEPENDS libc-benchmark
+ )
+
+ # libc-memory-benchmark
+ add_library(libc-memory-benchmark
+ STATIC
+ EXCLUDE_FROM_ALL
+ LibcMemoryBenchmark.cpp
+ LibcMemoryBenchmark.h
+ LibcFunctionPrototypes.h
+ MemorySizeDistributions.cpp
+ MemorySizeDistributions.h
+ )
+ target_include_directories(libc-memory-benchmark
+ PUBLIC
+ ${CMAKE_CURRENT_SOURCE_DIR}
+ )
+ target_link_libraries(libc-memory-benchmark
+ PUBLIC
+ libc-benchmark
+ )
+ llvm_update_compile_flags(libc-memory-benchmark)
+
+ add_libc_benchmark_unittest(libc-memory-benchmark-test
+ SRCS LibcMemoryBenchmarkTest.cpp
+ DEPENDS libc-memory-benchmark
+ )
+
+ # json
+ add_library(json
+ STATIC
+ EXCLUDE_FROM_ALL
+ JSON.cpp
+ JSON.h
+ )
+ target_link_libraries(json PUBLIC libc-memory-benchmark)
+ llvm_update_compile_flags(json)
+
+ add_libc_benchmark_unittest(json-test
+ SRCS JSONTest.cpp
+ DEPENDS json
+ )
+
+ #==============================================================================
+ # Benchmarking tool
+ #==============================================================================
+
+ # Benchmark all implementations that can run on the target CPU.
+ function(add_libc_multi_impl_benchmark name)
+ get_property(fq_implementations GLOBAL PROPERTY ${name}_implementations)
+ foreach(fq_config_name IN LISTS fq_implementations)
+ get_target_property(required_cpu_features ${fq_config_name} REQUIRE_CPU_FEATURES)
+ cpu_supports(can_run "${required_cpu_features}")
+ if(can_run)
+ set(benchmark_name ${fq_config_name}_benchmark)
+ add_executable(${benchmark_name}
+ EXCLUDE_FROM_ALL
+ LibcMemoryBenchmarkMain.cpp
+ )
+ get_target_property(entrypoint_object_file ${fq_config_name} "OBJECT_FILE_RAW")
+ target_link_libraries(${benchmark_name} PUBLIC json ${entrypoint_object_file})
+ string(TOUPPER ${name} name_upper)
+ target_compile_definitions(${benchmark_name} PRIVATE "-DLIBC_BENCHMARK_FUNCTION_${name_upper}=LIBC_NAMESPACE::${name}" "-DLIBC_BENCHMARK_FUNCTION_NAME=\"${fq_config_name}\"")
+ llvm_update_compile_flags(${benchmark_name})
+ else()
+ message(STATUS "Skipping benchmark for '${fq_config_name}' insufficient host cpu features '${required_cpu_features}'")
+ endif()
+ endforeach()
+ endfunction()
+
+ add_libc_multi_impl_benchmark(bcmp)
+ add_libc_multi_impl_benchmark(bzero)
+ add_libc_multi_impl_benchmark(memcmp)
+ add_libc_multi_impl_benchmark(memcpy)
+ add_libc_multi_impl_benchmark(memmove)
+ add_libc_multi_impl_benchmark(memset)
+
+ #==============================================================================
+ # Google Benchmarking tool
+ #==============================================================================
+
+ # This target uses the Google Benchmark facility to report throughput for llvm
+ # libc memory functions compiled for the host machine. This is useful to
+ # continuously monitor the performance of the memory functions.
+ add_executable(libc.benchmarks.memory_functions.opt_host
+ EXCLUDE_FROM_ALL
+ LibcMemoryGoogleBenchmarkMain.cpp
+ LibcDefaultImplementations.cpp
+ )
+ target_link_libraries(libc.benchmarks.memory_functions.opt_host
+ PRIVATE
+ libc-memory-benchmark
+ libc.src.string.memcmp_opt_host.__internal__
+ libc.src.string.bcmp_opt_host.__internal__
+ libc.src.string.memcpy_opt_host.__internal__
+ libc.src.string.memset_opt_host.__internal__
+ libc.src.string.bzero_opt_host.__internal__
+ libc.src.string.memmove_opt_host.__internal__
+ benchmark_main
+ )
+ llvm_update_compile_flags(libc.benchmarks.memory_functions.opt_host)
+
+ add_subdirectory(automemcpy)
+endif()
+
+if(LIBC_TARGET_OS_IS_GPU)
+ add_subdirectory(gpu)
+endif()
diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
new file mode 100644
index 0000000000000..2f258da8a3297
--- /dev/null
+++ b/libc/benchmarks/gpu/CMakeLists.txt
@@ -0,0 +1,203 @@
+add_subdirectory(timing)
+
+add_custom_target(gpu-benchmark)
+
+# Adds a hermetic GPU benchmark executable plus a custom target that runs it.
+#
+# Usage:
+#   add_gpu_benchmark(
+#     <name>
+#     SUITE <suite target>                 (required)
+#     SRCS <sources...>                    (required)
+#     HDRS <headers...>
+#     DEPENDS <libc targets...>
+#     ARGS <arguments passed to the benchmark executable...>
+#     ENV <values placed at the front of the run command...>
+#     COMPILE_OPTIONS <options...>
+#     LINK_LIBRARIES <libraries...>
+#     LOADER_ARGS <arguments placed before the executable...>
+#   )
+#
+# The run target is added as a dependency of both <suite target> and the
+# umbrella `gpu-benchmark` target.
+function (add_gpu_benchmark test_name)
+  # Resolve the fully qualified name first: the diagnostics below print it.
+  get_fq_target_name(${test_name} fq_target_name)
+
+  if(NOT TARGET libc.startup.${LIBC_TARGET_OS}.crt1)
+    message(VERBOSE "Skipping ${fq_target_name} as it is not available on ${LIBC_TARGET_OS}.")
+    return()
+  endif()
+
+  cmake_parse_arguments(
+    "GPU_BENCHMARK"
+    "" # No optional arguments
+    "SUITE" # Single value arguments
+    "SRCS;HDRS;DEPENDS;ARGS;ENV;COMPILE_OPTIONS;LINK_LIBRARIES;LOADER_ARGS" # Multi-value arguments
+    ${ARGN}
+  )
+
+  if(NOT GPU_BENCHMARK_SUITE)
+    message(FATAL_ERROR "SUITE not specified for ${fq_target_name}")
+  endif()
+  if(NOT GPU_BENCHMARK_SRCS)
+    message(FATAL_ERROR "The SRCS list for add_gpu_benchmark is missing.")
+  endif()
+
+  get_fq_target_name(${test_name}.libc fq_libc_target_name) # Stores the compiled libc + infrastructure archive to link in
+  get_fq_deps_list(fq_deps_list ${GPU_BENCHMARK_DEPENDS})
+  message(STATUS "Depends: ${fq_deps_list}")
+  list(APPEND fq_deps_list
+      # Hermetic tests use the platform's startup object. So, their deps also
+      # have to be collected.
+      libc.startup.${LIBC_TARGET_OS}.crt1
+      # We always add the memory functions objects. This is because the
+      # compiler's codegen can emit calls to the C memory functions.
+      libc.src.string.bcmp
+      libc.src.string.bzero
+      libc.src.string.memcmp
+      libc.src.string.memcpy
+      libc.src.string.memmove
+      libc.src.string.memset
+      libc.src.__support.StringUtil.error_to_string
+  )
+
+  list(REMOVE_DUPLICATES fq_deps_list)
+
+  # TODO: Instead of gathering internal object files from entrypoints,
+  # collect the object files with public names of entrypoints.
+  get_object_files_for_test(
+      link_object_files skipped_entrypoints_list ${fq_deps_list})
+  if(skipped_entrypoints_list)
+    if(LIBC_CMAKE_VERBOSE_LOGGING)
+      set(msg "Skipping hermetic test ${fq_target_name} as it has missing deps: "
+              "${skipped_entrypoints_list}.")
+      message(STATUS ${msg})
+    endif()
+    return()
+  endif()
+  list(REMOVE_DUPLICATES link_object_files)
+  message(STATUS ${link_object_files})
+
+  # Make a library of all deps
+  add_library(
+    ${fq_target_name}.__libc__
+    STATIC
+    EXCLUDE_FROM_ALL
+    ${link_object_files}
+  )
+  set_target_properties(${fq_target_name}.__libc__
+      PROPERTIES ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+  set_target_properties(${fq_target_name}.__libc__
+      PROPERTIES ARCHIVE_OUTPUT_NAME ${fq_target_name}.libc)
+
+  set(fq_build_target_name ${fq_target_name}.__build__)
+  add_executable(
+    ${fq_build_target_name}
+    EXCLUDE_FROM_ALL
+    $<$<BOOL:${LIBC_TARGET_ARCHITECTURE_IS_NVPTX}>:${link_object_files}>
+    ${GPU_BENCHMARK_SRCS}
+    ${GPU_BENCHMARK_HDRS}
+  )
+  set_target_properties(${fq_build_target_name}
+    PROPERTIES
+      RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+  )
+
+  target_include_directories(${fq_build_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR})
+  target_include_directories(${fq_build_target_name} PRIVATE ${LIBC_SOURCE_DIR})
+  _get_hermetic_test_compile_options(compile_options "${GPU_BENCHMARK_COMPILE_OPTIONS}")
+  message(STATUS "adding compile: ${compile_options}")
+  target_compile_options(${fq_build_target_name} PRIVATE ${compile_options} -save-temps)
+
+  # Prefer the `.hermetic` flavour of a library whenever such a target exists.
+  # `IN LISTS` must be upper case: CMake keywords are case-sensitive, and a
+  # lower-case `in` makes foreach() iterate over the literal words instead of
+  # the contents of the list.
+  set(link_libraries "")
+  foreach(lib IN LISTS GPU_BENCHMARK_LINK_LIBRARIES)
+    if(TARGET ${lib}.hermetic)
+      list(APPEND link_libraries ${lib}.hermetic)
+    else()
+      list(APPEND link_libraries ${lib})
+    endif()
+  endforeach()
+
+  message(STATUS "IS nvptx: ${LIBC_TARGET_ARCHITECTURE_IS_NVPTX}")
+
+  if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
+    target_link_options(${fq_build_target_name} PRIVATE
+      ${LIBC_COMPILE_OPTIONS_DEFAULT}
+      -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto -Wno-multi-gpu
+      "-Wl,-mllvm,-amdgpu-lower-global-ctor-dtor=0" -nostdlib -static
+      "-Wl,-mllvm,-amdhsa-code-object-version=${LIBC_GPU_CODE_OBJECT_VERSION}")
+  elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
+    # We need to use the internal object versions for NVPTX.
+    set(internal_suffix ".__internal__")
+    target_link_options(${fq_build_target_name} PRIVATE
+      ${LIBC_COMPILE_OPTIONS_DEFAULT} -Wno-multi-gpu
+      "-Wl,--suppress-stack-size-warning"
+      -march=${LIBC_GPU_TARGET_ARCHITECTURE} -nostdlib -static
+      "--cuda-path=${LIBC_CUDA_ROOT}")
+      message(STATUS "ARCH: ${LIBC_GPU_TARGET_ARCHITECTURE}")
+  elseif(LIBC_CC_SUPPORTS_NOSTDLIBPP)
+    target_link_options(${fq_build_target_name} PRIVATE -nolibc -nostartfiles -nostdlib++ -static)
+  else()
+    # Older version of gcc does not support `nostdlib++` flag.  We use
+    # `nostdlib` and link against libgcc_s, which cannot be linked statically.
+    target_link_options(${fq_build_target_name} PRIVATE -nolibc -nostartfiles -nostdlib)
+    list(APPEND link_libraries ${LIBGCC_S_LOCATION})
+  endif()
+
+  # link libraries for the BUILD target (i.e. to compile the test)
+  target_link_libraries(
+    ${fq_build_target_name}
+    PRIVATE
+      libc.startup.${LIBC_TARGET_OS}.crt1${internal_suffix}
+      ${link_libraries}
+      # LibcTest.hermetic
+      LibcGpuBenchmark.hermetic
+      # LibcHermeticTestSupport.hermetic
+      LibcHermeticTestSupport.hermetic
+      # The NVIDIA 'nvlink' linker does not currently support static libraries.
+      $<$<NOT:$<BOOL:${LIBC_TARGET_ARCHITECTURE_IS_NVPTX}>>:${fq_target_name}.__libc__>)
+
+  add_dependencies(${fq_build_target_name}
+                   LibcGpuBenchmark.hermetic
+                   ${fq_deps_list})
+
+  # Tests on the GPU require an external loader utility to launch the kernel.
+  if(TARGET libc.utils.gpu.loader)
+    add_dependencies(${fq_build_target_name} libc.utils.gpu.loader)
+    get_target_property(gpu_loader_exe libc.utils.gpu.loader "EXECUTABLE")
+  endif()
+
+  set(test_cmd ${GPU_BENCHMARK_ENV}
+      $<$<BOOL:${LIBC_TARGET_OS_IS_GPU}>:${gpu_loader_exe}> ${CMAKE_CROSSCOMPILING_EMULATOR} ${GPU_BENCHMARK_LOADER_ARGS}
+      $<TARGET_FILE:${fq_build_target_name}> ${GPU_BENCHMARK_ARGS})
+  add_custom_target(
+    ${fq_target_name}
+    COMMAND ${test_cmd}
+    COMMAND_EXPAND_LISTS
+    COMMENT "Running GPU benchmark ${fq_target_name}"
+  )
+
+  # Make this benchmark part of its suite
+  add_dependencies(${GPU_BENCHMARK_SUITE} ${fq_target_name})
+  # Remember to make this benchmark part of the umbrella command
+  add_dependencies(gpu-benchmark ${fq_target_name})
+endfunction(add_gpu_benchmark)
+
+# Benchmark framework library. add_gpu_benchmark() links every benchmark
+# executable against the `LibcGpuBenchmark.hermetic` target.
+add_unittest_framework_library(
+  LibcGpuBenchmark
+  SRCS
+    LibcGpuBenchmark.cpp
+    LibcGpuBenchmarkMain.cpp
+    TestLogger.cpp
+  HDRS
+    LibcGpuBenchmark.h
+    TestLogger.h
+  DEPENDS
+    libc.src.__support.big_int
+    libc.src.__support.c_string
+    libc.src.__support.CPP.string
+    libc.src.__support.CPP.string_view
+    libc.src.__support.CPP.type_traits
+    libc.src.__support.fixed_point.fx_rep
+    libc.src.__support.macros.properties.types
+    libc.src.__support.OSUtil.osutil
+    libc.src.__support.uint128
+    libc.benchmarks.gpu.timing.timing
+)
+
+# Suite target that the `dummy` benchmark below attaches itself to.
+add_custom_target(dummy-suite)
+
+# Benchmark built from dummy.cpp and registered in `dummy-suite`.
+add_gpu_benchmark(
+  dummy
+  SUITE
+    dummy-suite
+  SRCS
+    dummy.cpp
+  DEPENDS
+    libc.src.stdio.fputs
+    libc.src.stdio.stderr
+)
+
+add_subdirectory(src)
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
new file mode 100644
index 0000000000000..46e8f904e0643
--- /dev/null
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -0,0 +1,75 @@
+
+#include "benchmarks/gpu/timing/timing.h"
+
+#include "src/__support/CPP/string.h"
+#include "src/__support/CPP/string_view.h"
+#include "src/__support/OSUtil/io.h"
+
+#include "LibcGpuBenchmark.h"
+
+namespace LIBC_NAMESPACE {
+namespace libc_gpu_benchmarks {
+
+Test *Test::Start = nullptr;
+Test *Test::End = nullptr;
+
+// Appends `T` to the tail of the registration list; the first registered
+// test also becomes the head of the list.
+void Test::addTest(Test *T) {
+  if (End != nullptr)
+    End->Next = T;
+  else
+    Start = T;
+
+  End = T;
+}
+
+// Walks the registration list from `Start`, logging each test's name before
+// running it. Always returns 0.
+int Test::runTests() {
+  Test *Current = Start;
+  while (Current != nullptr) {
+    tlog << Current->getName() << "\n";
+    Current->Run();
+    Current = Current->Next;
+  }
+
+  return 0;
+}
+
+// Repeatedly calls `WrapperFunc` and returns the estimated number of cycles
+// per call, with the value reported by overhead() subtracted from every
+// measurement.
+//
+// Sampling starts at Options.InitialIterations calls per sample (at least 1)
+// and grows the iteration count after every sample. It stops once
+// Options.MaxSamples or Options.MaxIterations is reached, or once at least
+// Options.MinSamples samples were taken and the estimate changed by less than
+// Options.Epsilon.
+uint64_t benchmark_wrapper(const BenchmarkOptions &Options,
+                           uint64_t (*WrapperFunc)()) {
+  RuntimeEstimationProgression REP;
+  size_t Iterations = Options.InitialIterations;
+  if (Iterations < 1)
+    Iterations = 1;
+  size_t Samples = 0;
+  uint64_t BestGuess = 0;
+  for (;;) {
+    uint64_t SampleCycles = 0;
+    for (size_t I = 0; I < Iterations; I++) {
+      const uint64_t Overhead = LIBC_NAMESPACE::overhead();
+      const uint64_t Measured = WrapperFunc();
+      // Clamp at zero: a measurement smaller than the overhead would otherwise
+      // wrap around to a huge unsigned value and poison the whole sample.
+      SampleCycles += Measured > Overhead ? Measured - Overhead : 0;
+    }
+
+    Samples++;
+    const double ChangeRatio =
+        REP.ComputeImprovement({Iterations, SampleCycles});
+    BestGuess = REP.CurrentEstimation;
+
+    if (Samples >= Options.MaxSamples || Iterations >= Options.MaxIterations)
+      break;
+    if (Samples >= Options.MinSamples && ChangeRatio < Options.Epsilon) {
+      tlog << "Samples are stable!\n";
+      break;
+    }
+
+    // Scaling an integer count by a fractional factor truncates (1 * 1.4 is
+    // 1 again), so force the count to grow by at least one iteration.
+    const size_t Scaled =
+        static_cast<size_t>(Iterations * Options.ScalingFactor);
+    Iterations = Scaled > Iterations ? Scaled : Iterations + 1;
+  }
+  tlog << "Best Guess: " << BestGuess << '\n';
+  tlog << "Samples: " << Samples << '\n';
+  return BestGuess;
+}
+
+} // namespace libc_gpu_benchmarks
+} // namespace LIBC_NAMESPACE
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
new file mode 100644
index 0000000000000..12695c4d18684
--- /dev/null
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -0,0 +1,282 @@
+#ifndef LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H
+#define LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H
+
+#include "benchmarks/gpu/timing/timing.h"
+
+#include "benchmarks/gpu/TestLogger.h"
+#include "src/__support/CPP/string.h"
+#include "src/__support/CPP/string_view.h"
+#include "src/__support/OSUtil/io.h"
+
+#include <stdint.h>
+
+namespace LIBC_NAMESPACE {
+
+namespace libc_gpu_benchmarks {
+
+// Tuning knobs read by the benchmark loops in this library.
+struct BenchmarkOptions {
+  // Iterations per sample for the first sample (the loops raise 0 to 1).
+  uint32_t InitialIterations = 1;
+  // Sampling stops once a sample has used at least this many iterations.
+  uint32_t MaxIterations = 10000000;
+  // Minimum number of samples before the stability check may stop the run.
+  uint32_t MinSamples = 4;
+  // Sampling stops once this many samples have been taken.
+  uint32_t MaxSamples = 1000;
+  // The run counts as stable when the change ratio drops below this value.
+  double Epsilon = 0.01;
+  // Factor the iteration count is multiplied by after each sample.
+  double ScalingFactor = 1.4;
+};
+
+// One sample: the number of iterations run and the cycles they took in total.
+struct Measurement {
+  size_t Iterations = 0;
+  uint64_t ElapsedCycles = 0;
+};
+
+// Accumulates measurements and yields the running mean cycles per iteration.
+class RefinableRuntimeEstimation {
+  uint64_t TotalCycles = 0;
+  size_t TotalIterations = 0;
+
+public:
+  // Folds `M` into the running totals and returns the updated average number
+  // of cycles per iteration. Returns 0 while no iterations have been recorded
+  // so that an empty measurement cannot cause an integer division by zero.
+  uint64_t Update(const Measurement &M) {
+    TotalCycles += M.ElapsedCycles;
+    TotalIterations += M.Iterations;
+    if (TotalIterations == 0)
+      return 0;
+    return TotalCycles / TotalIterations;
+  }
+};
+
+// Tracks the progression of the runtime estimation
+class RuntimeEstimationProgression {
+  RefinableRuntimeEstimation RRE;
+
+public:
+  // Most recent cycles-per-iteration estimate; 0 before any measurement.
+  uint64_t CurrentEstimation = 0;
+
+  // Folds `M` into the estimate, stores the new estimate in CurrentEstimation
+  // and returns the relative change |previous / new - 1|.
+  double ComputeImprovement(const Measurement &M) {
+    const uint64_t NewEstimation = RRE.Update(M);
+    const uint64_t OldEstimation = CurrentEstimation;
+    CurrentEstimation = NewEstimation;
+
+    // A zero estimate cannot serve as the divisor: 0 / 0 would yield NaN,
+    // which never compares below a threshold and so could never be reported
+    // as converged. Two zero estimates mean "no change"; a drop from a
+    // non-zero estimate to zero is reported as a 100% change.
+    if (NewEstimation == 0)
+      return OldEstimation == 0 ? 0.0 : 1.0;
+
+    const double Ratio = static_cast<double>(OldEstimation) /
+                             static_cast<double>(NewEstimation) -
+                         1.0;
+    return Ratio < 0 ? -Ratio : Ratio;
+  }
+};
+
+// Repeatedly times `f(args...)` through latency() and returns the estimated
+// number of cycles per call.
+//
+// Sampling starts at Options.InitialIterations calls per sample (at least 1)
+// and grows the iteration count after every sample. It stops once
+// Options.MaxSamples or Options.MaxIterations is reached, or once at least
+// Options.MinSamples samples were taken and the estimate changed by less than
+// Options.Epsilon. Every individual measurement is written to `tlog`.
+template <typename F, typename... Args>
+uint64_t benchmark(const BenchmarkOptions &Options, F f, Args... args) {
+  RuntimeEstimationProgression REP;
+  size_t Iterations = Options.InitialIterations;
+  if (Iterations < 1)
+    Iterations = 1;
+  size_t Samples = 0;
+  uint64_t BestGuess = 0;
+#if defined(LIBC_TARGET_ARCH_IS_NVPTX)
+  // Nvidia cannot perform LTO, so we need to perform
+  // 1 call to "warm up" the function before microbenchmarking
+  const uint64_t WarmUpCycles = latency(f, args...);
+  tlog << "Running warm-up iteration: " << WarmUpCycles << '\n';
+#endif
+  for (;;) {
+    uint64_t SampleCycles = 0;
+    for (size_t I = 0; I < Iterations; I++) {
+      const uint64_t Cycles = latency(f, args...);
+      SampleCycles += Cycles;
+      tlog << Cycles << '\n';
+    }
+
+    Samples++;
+    const double ChangeRatio =
+        REP.ComputeImprovement({Iterations, SampleCycles});
+    BestGuess = REP.CurrentEstimation;
+
+    if (Samples >= Options.MaxSamples || Iterations >= Options.MaxIterations)
+      break;
+    if (Samples >= Options.MinSamples && ChangeRatio < Options.Epsilon) {
+      tlog << "Samples are stable!\n";
+      break;
+    }
+
+    // Scaling an integer count by a fractional factor truncates (1 * 1.4 is
+    // 1 again), so force the count to grow by at least one iteration.
+    const size_t Scaled =
+        static_cast<size_t>(Iterations * Options.ScalingFactor);
+    Iterations = Scaled > Iterations ? Scaled : Iterations + 1;
+  }
+  tlog << "Best Guess: " << BestGuess << '\n';
+  tlog << "Samples: " << Samples << '\n';
+  return BestGuess;
+}
+
+// Benchmarks `WrapperFunc`, a function that returns its own cycle count.
+// Defined in LibcGpuBenchmark.cpp.
+uint64_t benchmark_wrapper(const BenchmarkOptions &Options,
+                           uint64_t (*WrapperFunc)());
+
+// Variant of benchmark() that takes each measurement with the
+// SINGLE_INPUT_OUTPUT_LATENCY macro instead of latency(), and returns the
+// estimated number of cycles per call of `f(arg)`.
+//
+// Sampling starts at Options.InitialIterations calls per sample (at least 1)
+// and grows the iteration count after every sample. It stops once
+// Options.MaxSamples or Options.MaxIterations is reached, or once at least
+// Options.MinSamples samples were taken and the estimate changed by less than
+// Options.Epsilon. Every individual measurement is written to `tlog`.
+template <typename F, typename Arg>
+uint64_t benchmark_macro(const BenchmarkOptions &Options, F f, Arg arg) {
+  RuntimeEstimationProgression REP;
+  size_t Iterations = Options.InitialIterations;
+  if (Iterations < 1)
+    Iterations = 1;
+  size_t Samples = 0;
+  uint64_t BestGuess = 0;
+  for (;;) {
+    uint64_t SampleCycles = 0;
+    for (size_t I = 0; I < Iterations; I++) {
+      uint64_t result = 0;
+      SINGLE_INPUT_OUTPUT_LATENCY(f, arg, &result);
+      SampleCycles += result;
+      tlog << "Macro: " << result << '\n';
+    }
+
+    Samples++;
+    const double ChangeRatio =
+        REP.ComputeImprovement({Iterations, SampleCycles});
+    BestGuess = REP.CurrentEstimation;
+
+    if (Samples >= Options.MaxSamples || Iterations >= Options.MaxIterations)
+      break;
+    if (Samples >= Options.MinSamples && ChangeRatio < Options.Epsilon) {
+      tlog << "Samples are stable!\n";
+      break;
+    }
+
+    // Scaling an integer count by a fractional factor truncates (1 * 1.4 is
+    // 1 again), so force the count to grow by at least one iteration.
+    const size_t Scaled =
+        static_cast<size_t>(Iterations * Options.ScalingFactor);
+    Iterations = Scaled > Iterations ? Scaled : Iterations + 1;
+  }
+  tlog << "Macro Best Guess: " << BestGuess << '\n';
+  tlog << "Samples: " << Samples << '\n';
+  return BestGuess;
+}
+
+// Base class of every benchmark. Objects are chained into a singly linked
+// list through addTest(); runTests() walks that list from Start.
+class Test {
+  // Successor in the registration list.
+  Test *Next = nullptr;
+
+public:
+  virtual ~Test() = default;
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  // Runs every registered test in registration order.
+  static int runTests();
+
+protected:
+  // Appends a test to the registration list.
+  static void addTest(Test *);
+
+private:
+  virtual void Run() = 0;
+  virtual const char *getName() const = 0;
+
+  // Head and tail of the registration list.
+  static Test *Start;
+  static Test *End;
+};
+
+// Benchmark that times a callable of type `F` with benchmark() using default
+// options. The object registers itself in the Test list on construction.
+template <typename F> class FunctionBenchmark : public Test {
+  F Func;
+  const char *Name;
+
+public:
+  FunctionBenchmark(F Func, char const *Name) : Func(Func), Name(Name) {
+    addTest(this);
+  }
+
+private:
+  const char *getName() const override { return Name; }
+
+  // Measures Func and logs the benchmark name followed by the estimate.
+  void Run() override {
+    BenchmarkOptions DefaultOptions;
+    const auto Cycles = benchmark(DefaultOptions, Func);
+    tlog << "FnName: " << Name << '\n';
+    tlog << "FnBenchmark: " << Cycles << '\n';
+  }
+};
+
+// Benchmark around a function that returns its own cycle count; measured with
+// benchmark_wrapper() using default options. The object registers itself in
+// the Test list on construction.
+class WrapperBenchmark : public Test {
+  using BenchmarkWrapperFunction = uint64_t (*)();
+  BenchmarkWrapperFunction Func;
+  const char *Name;
+
+public:
+  WrapperBenchmark(BenchmarkWrapperFunction Func, char const *Name)
+      : Func(Func), Name(Name) {
+    addTest(this);
+  }
+
+private:
+  const char *getName() const override { return Name; }
+
+  // Announces the wrapper, measures it and logs its name and the estimate.
+  void Run() override {
+    tlog << "Running wrapper: " << Name << '\n';
+    BenchmarkOptions DefaultOptions;
+    const auto Cycles = benchmark_wrapper(DefaultOptions, Func);
+    tlog << "FnName: " << Name << '\n';
+    tlog << "FnBenchmark: " << Cycles << '\n';
+  }
+};
+
+} // namespace libc_gpu_benchmarks
+
+} // namespace LIBC_NAMESPACE
+
+// #define BENCHMARK(SuiteName, TestName) \
+// class SuiteName##_##TestName \
+// : public LIBC_NAMESPACE::libc_gpu_benchmarks::Test { \
+// public: \
+// SuiteName##_##TestName() { addTest(this); } \
+// void Run() override; \
+// const char *getName() const override { return #SuiteName "." #TestName; } \
+// }; \
+// SuiteName##_##TestName SuiteName##_##TestName##_Instance; \
+// void SuiteName##_##TestName::Run()
+
+// Defines a Test subclass named SuiteName_TestName and one global instance of
+// it; the constructor registers the instance through addTest(). Run() calls
+// benchmark() with default BenchmarkOptions on &Func and Arg and discards the
+// returned estimate.
+#define BENCHMARK_SINGLE_INPUT_OUTPUT(SuiteName, TestName, Func, Arg)          \
+  class SuiteName##_##TestName                                                 \
+      : public LIBC_NAMESPACE::libc_gpu_benchmarks::Test {                     \
+  public:                                                                      \
+    SuiteName##_##TestName() { addTest(this); }                                \
+    void Run() override;                                                       \
+    const char *getName() const override { return #SuiteName "." #TestName; }  \
+  };                                                                           \
+  SuiteName##_##TestName SuiteName##_##TestName##_Instance;                    \
+  void SuiteName##_##TestName::Run() {                                         \
+    LIBC_NAMESPACE::libc_gpu_benchmarks::BenchmarkOptions Options;             \
+    LIBC_NAMESPACE::libc_gpu_benchmarks::benchmark(Options, &Func, Arg);       \
+  }
+
+// Declares a FunctionBenchmark object named SuiteName_TestName_Instance that
+// wraps Func and is labelled "SuiteName.TestName". The expansion already ends
+// with a semicolon.
+#define BENCHMARK_FN(SuiteName, TestName, Func)                                \
+  LIBC_NAMESPACE::libc_gpu_benchmarks::FunctionBenchmark                       \
+      SuiteName##_##TestName##_Instance(Func, #SuiteName "." #TestName);
+
+// Declares a WrapperBenchmark object named SuiteName_TestName_Instance that
+// wraps Func and is labelled "SuiteName.TestName". The expansion already ends
+// with a semicolon.
+#define BENCHMARK_WRAPPER(SuiteName, TestName, Func)                           \
+  LIBC_NAMESPACE::libc_gpu_benchmarks::WrapperBenchmark                        \
+      SuiteName##_##TestName##_Instance(Func, #SuiteName "." #TestName);
+
+// Same shape as BENCHMARK_SINGLE_INPUT_OUTPUT, except that Run() measures
+// through benchmark_macro() instead of benchmark(). The returned estimate is
+// discarded.
+#define BENCHMARK_S_I_O_V2(SuiteName, TestName, Func, Arg)                     \
+  class SuiteName##_##TestName                                                 \
+      : public LIBC_NAMESPACE::libc_gpu_benchmarks::Test {                     \
+  public:                                                                      \
+    SuiteName##_##TestName() { addTest(this); }                                \
+    void Run() override;                                                       \
+    const char *getName() const override { return #SuiteName "." #TestName; }  \
+  };                                                                           \
+  SuiteName##_##TestName SuiteName##_##TestName##_Instance;                    \
+  void SuiteName##_##TestName::Run() {                                         \
+    LIBC_NAMESPACE::libc_gpu_benchmarks::BenchmarkOptions Options;             \
+    LIBC_NAMESPACE::libc_gpu_benchmarks::benchmark_macro(Options, &Func, Arg); \
+  }
+
+#endif
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp b/libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp
new file mode 100644
index 0000000000000..e7e4f08f5af68
--- /dev/null
+++ b/libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp
@@ -0,0 +1,6 @@
+#include "LibcGpuBenchmark.h"
+
+// Entry point of the benchmark binary. Runs every registered benchmark and
+// returns the status reported by the runner instead of discarding it, so a
+// non-zero result from runTests() reaches the caller. The arguments are
+// currently unused.
+extern "C" int main(int argc, char **argv, char **envp) {
+ return LIBC_NAMESPACE::libc_gpu_benchmarks::Test::runTests();
+}
diff --git a/libc/benchmarks/gpu/TestLogger.cpp b/libc/benchmarks/gpu/TestLogger.cpp
new file mode 100644
index 0000000000000..b3a8399a91adb
--- /dev/null
+++ b/libc/benchmarks/gpu/TestLogger.cpp
@@ -0,0 +1,89 @@
+#include "benchmarks/gpu/TestLogger.h"
+#include "src/__support/CPP/string.h"
+#include "src/__support/CPP/string_view.h"
+#include "src/__support/OSUtil/io.h" // write_to_stderr
+#include "src/__support/big_int.h" // is_big_int
+#include "src/__support/macros/properties/types.h" // LIBC_TYPES_HAS_INT128
+#include "src/__support/uint128.h"
+
+#include <stdint.h>
+
+namespace LIBC_NAMESPACE {
+namespace libc_gpu_benchmarks {
+
+// cpp::string_view specialization
+// Every other overload eventually funnels into this one, which hands the text
+// to write_to_stderr.
+template <>
+TestLogger &TestLogger::operator<< <cpp::string_view>(cpp::string_view str) {
+ LIBC_NAMESPACE::write_to_stderr(str);
+ return *this;
+}
+
+// cpp::string specialization
+template <> TestLogger &TestLogger::operator<< <cpp::string>(cpp::string str) {
+ return *this << static_cast<cpp::string_view>(str);
+}
+
+// const char* specialization
+template <> TestLogger &TestLogger::operator<< <const char *>(const char *str) {
+ return *this << cpp::string_view(str);
+}
+
+// char* specialization
+template <> TestLogger &TestLogger::operator<< <char *>(char *str) {
+ return *this << cpp::string_view(str);
+}
+
+// char specialization
+// Logs the character itself (a one-element view), not its numeric value.
+template <> TestLogger &TestLogger::operator<<(char ch) {
+ return *this << cpp::string_view(&ch, 1);
+}
+
+// bool specialization
+template <> TestLogger &TestLogger::operator<<(bool cond) {
+ return *this << (cond ? "true" : "false");
+}
+
+// void * specialization
+// Logs the address in hexadecimal. The "0x" prefix comes from the formatter
+// itself; previously a literal "0x" was followed by the decimal digits
+// produced by cpp::to_string, which printed a misleading value.
+template <> TestLogger &TestLogger::operator<<(void *addr) {
+ const IntegerToString<uintptr_t, radix::Hex::WithPrefix> buffer(
+ reinterpret_cast<uintptr_t>(addr));
+ return *this << buffer.view();
+}
+
+// Generic path used by the explicit instantiations below: big integers and
+// unsigned integers wider than 64 bits are logged as prefixed hexadecimal,
+// everything else goes through cpp::to_string.
+template <typename T> TestLogger &TestLogger::operator<<(T t) {
+ if constexpr (is_big_int_v<T> ||
+ (cpp::is_integral_v<T> && cpp::is_unsigned_v<T> &&
+ (sizeof(T) > sizeof(uint64_t)))) {
+ static_assert(sizeof(T) % 8 == 0, "Unsupported size of UInt");
+ const IntegerToString<T, radix::Hex::WithPrefix> buffer(t);
+ return *this << buffer.view();
+ } else {
+ return *this << cpp::to_string(t);
+ }
+}
+
+// is_integral specializations
+// char is already specialized to handle character
+template TestLogger &TestLogger::operator<< <short>(short);
+template TestLogger &TestLogger::operator<< <int>(int);
+template TestLogger &TestLogger::operator<< <long>(long);
+template TestLogger &TestLogger::operator<< <long long>(long long);
+template TestLogger &TestLogger::operator<< <unsigned char>(unsigned char);
+template TestLogger &TestLogger::operator<< <unsigned short>(unsigned short);
+template TestLogger &TestLogger::operator<< <unsigned int>(unsigned int);
+template TestLogger &TestLogger::operator<< <unsigned long>(unsigned long);
+template TestLogger &
+ TestLogger::operator<< <unsigned long long>(unsigned long long);
+
+#ifdef LIBC_TYPES_HAS_INT128
+template TestLogger &TestLogger::operator<< <__uint128_t>(__uint128_t);
+#endif // LIBC_TYPES_HAS_INT128
+template TestLogger &TestLogger::operator<< <UInt<128>>(UInt<128>);
+template TestLogger &TestLogger::operator<< <UInt<192>>(UInt<192>);
+template TestLogger &TestLogger::operator<< <UInt<256>>(UInt<256>);
+template TestLogger &TestLogger::operator<< <UInt<320>>(UInt<320>);
+
+// TODO: Add floating point formatting once it's supported by StringStream.
+
+// Definition of the global logger declared in TestLogger.h.
+TestLogger tlog;
+
+} // namespace libc_gpu_benchmarks
+} // namespace LIBC_NAMESPACE
diff --git a/libc/benchmarks/gpu/TestLogger.h b/libc/benchmarks/gpu/TestLogger.h
new file mode 100644
index 0000000000000..68690ce0c3940
--- /dev/null
+++ b/libc/benchmarks/gpu/TestLogger.h
@@ -0,0 +1,27 @@
+//===-- Utilities to log to standard output during tests --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_TEST_UNITTEST_TESTLOGGER_H
+#define LLVM_LIBC_TEST_UNITTEST_TESTLOGGER_H
+
+namespace LIBC_NAMESPACE {
+namespace libc_gpu_benchmarks {
+
+// A stream-style logger for the GPU benchmarks. The string_view overload in
+// TestLogger.cpp writes through write_to_stderr, so output goes to standard
+// error. operator<< is only declared here; the supported argument types are
+// the specializations and explicit instantiations provided in TestLogger.cpp.
+struct TestLogger {
+ constexpr TestLogger() = default;
+ // Logs one value and returns *this so calls can be chained.
+ template <typename T> TestLogger &operator<<(T);
+};
+
+// A global TestLogger instance to be used by the benchmarks; defined in
+// TestLogger.cpp.
+extern TestLogger tlog;
+
+} // namespace libc_gpu_benchmarks
+} // namespace LIBC_NAMESPACE
+
+#endif /* LLVM_LIBC_TEST_UNITTEST_TESTLOGGER_H */
diff --git a/libc/benchmarks/gpu/dummy.cpp b/libc/benchmarks/gpu/dummy.cpp
new file mode 100644
index 0000000000000..6d15f98fca220
--- /dev/null
+++ b/libc/benchmarks/gpu/dummy.cpp
@@ -0,0 +1,42 @@
+#include "LibcGpuBenchmark.h"
+#include "timing/timing.h"
+
+#include "src/stdio/fputs.h"
+
+// Trivial workload for the dummy benchmarks: yields the successor of x.
+int add_test(int x) {
+ const int successor = x + 1;
+ return successor;
+}
+
+// Identity function used to time a bare function call. It is marked noinline
+// so the call is not folded into the caller; the attribute was previously
+// spelled twice (GNU and C++11 syntax), and one spelling is enough. The empty
+// asm statement is presumably there to keep the body from being optimized
+// away.
+[[gnu::noinline]] int function_call_overhead(int x) {
+ asm volatile ("");
+ return x;
+}
+
+// void DummyHiBenchmark() {
+// LIBC_NAMESPACE::fputs("Hi\n", stderr);
+// }
+// BENCHMARK_FN(Dummy, DummyHiBenchmark, DummyHiBenchmark);
+
+// void DummyV2Benchmark() {
+// int result = dummy_hi(10);
+// asm volatile("" :: "r"(result));
+// auto test_cycles = LIBC_NAMESPACE::latency(dummy_hi, 10);
+// LIBC_NAMESPACE::libc_gpu_benchmarks::tlog << "In func: " << test_cycles << '\n';
+// }
+// BENCHMARK_FN(Dummy, DummyV2Benchmark, DummyV2Benchmark);
+
+// BENCHMARK_SINGLE_INPUT_OUTPUT(Dummy, DummySingleInputOutput, dummy_hi, 10);
+
+// BENCHMARK_S_I_O_V2(Dummy, DummySIOMacro, dummy_hi, 10);
+
+// uint64_t DummyWrapperBenchmark() {
+// int x = 10;
+// return LIBC_NAMESPACE::latency(add_test, x);
+// }
+// BENCHMARK_WRAPPER(Dummy, DummyWrapperBenchmark, DummyWrapperBenchmark);
+
+// Benchmark body: reports the latency measured for one call of
+// function_call_overhead with the input 10.
+uint64_t DummyFunctionCallOverhead() {
+ const int input = 10;
+ const uint64_t cycles =
+ LIBC_NAMESPACE::latency(function_call_overhead, input);
+ return cycles;
+}
+BENCHMARK_WRAPPER(Dummy, DummyFunctionCallOverhead, DummyFunctionCallOverhead);
diff --git a/libc/benchmarks/gpu/src/CMakeLists.txt b/libc/benchmarks/gpu/src/CMakeLists.txt
new file mode 100644
index 0000000000000..f15d082e4dd2b
--- /dev/null
+++ b/libc/benchmarks/gpu/src/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_subdirectory(ctype)
+add_subdirectory(math)
diff --git a/libc/benchmarks/gpu/src/ctype/CMakeLists.txt b/libc/benchmarks/gpu/src/ctype/CMakeLists.txt
new file mode 100644
index 0000000000000..ab2f6cdf0c7fd
--- /dev/null
+++ b/libc/benchmarks/gpu/src/ctype/CMakeLists.txt
@@ -0,0 +1,21 @@
+# Target named as the SUITE of every ctype benchmark declared below
+# (presumably so they can be built as a group; confirm against
+# add_gpu_benchmark).
+add_custom_target(libc-gpu-ctype-benchmarks)
+
+# Benchmark built from isalnum_benchmark.cpp; depends on the isalnum
+# entrypoint.
+add_gpu_benchmark(
+ isalnum_benchmark
+ SUITE
+ libc-gpu-ctype-benchmarks
+ SRCS
+ isalnum_benchmark.cpp
+ DEPENDS
+ libc.src.ctype.isalnum
+)
+
+# Benchmark built from isalpha_benchmark.cpp; depends on the isalpha
+# entrypoint.
+add_gpu_benchmark(
+ isalpha_benchmark
+ SUITE
+ libc-gpu-ctype-benchmarks
+ SRCS
+ isalpha_benchmark.cpp
+ DEPENDS
+ libc.src.ctype.isalpha
+)
diff --git a/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp b/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp
new file mode 100644
index 0000000000000..071675bb887b6
--- /dev/null
+++ b/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp
@@ -0,0 +1,24 @@
+#include "benchmarks/gpu/LibcGpuBenchmark.h"
+
+#include "src/ctype/isalnum.h"
+
+// BENCHMARK_SINGLE_INPUT_OUTPUT(LlvmLibcIsAlNumGpuBenchmark,
+// IsAlnumSingleInputOutput,
+// LIBC_NAMESPACE::isalnum, 'c');
+
+// void BM_IsAlnumBasic() { bool isAlpha = LIBC_NAMESPACE::isalnum('c'); }
+// BENCHMARK_FN(LlvmLibcIsAlNumGpuBenchmark, IsAlnumC, BM_IsAlnumBasic);
+
+// Benchmark body: reports the latency measured for one call of isalnum on the
+// character 'c'.
+uint64_t BM_IsAlnumWrapper() {
+ const char input = 'c';
+ const uint64_t cycles =
+ LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalnum, input);
+ return cycles;
+}
+BENCHMARK_WRAPPER(LlvmLibcIsAlNumGpuBenchmark, IsAlnumWrapper,
+ BM_IsAlnumWrapper);
+
+// Benchmark body: like BM_IsAlnumWrapper, but measured through
+// function_call_latency, which subtracts the latency of a trivial function
+// call from the result.
+uint64_t BM_IsAlnumWithOverhead() {
+ const char input = 'c';
+ const uint64_t cycles =
+ LIBC_NAMESPACE::function_call_latency(LIBC_NAMESPACE::isalnum, input);
+ return cycles;
+}
+BENCHMARK_WRAPPER(LlvmLibcIsAlNumGpuBenchmark, IsAlnumWithOverhead,
+ BM_IsAlnumWithOverhead);
diff --git a/libc/benchmarks/gpu/src/ctype/isalpha_benchmark.cpp b/libc/benchmarks/gpu/src/ctype/isalpha_benchmark.cpp
new file mode 100644
index 0000000000000..e432b1cf72c5f
--- /dev/null
+++ b/libc/benchmarks/gpu/src/ctype/isalpha_benchmark.cpp
@@ -0,0 +1,9 @@
+#include "benchmarks/gpu/LibcGpuBenchmark.h"
+
+#include "src/ctype/isalpha.h"
+
+// Benchmark body: reports the latency measured for one call of isalpha on the
+// character 'c'.
+uint64_t BM_IsAlpha() {
+ const char input = 'c';
+ const uint64_t cycles =
+ LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalpha, input);
+ return cycles;
+}
+BENCHMARK_WRAPPER(LlvmLibcIsAlphaGpuBenchmark, IsAlpha, BM_IsAlpha);
diff --git a/libc/benchmarks/gpu/src/math/CMakeLists.txt b/libc/benchmarks/gpu/src/math/CMakeLists.txt
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/libc/benchmarks/gpu/timing/CMakeLists.txt b/libc/benchmarks/gpu/timing/CMakeLists.txt
new file mode 100644
index 0000000000000..4a65b25005bab
--- /dev/null
+++ b/libc/benchmarks/gpu/timing/CMakeLists.txt
@@ -0,0 +1,20 @@
+# Configure-time trace message (debugging aid).
+message(STATUS "In GPU subdir")
+# Disabled guard that would skip this directory for non-GPU targets.
+# if(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU)
+# message(STATUS "Target arch is NOT gpu")
+# return()
+# endif()
+
+# Add each per-target timing directory and collect the name of the header
+# library it defines. Only nvptx is listed.
+foreach(target nvptx)
+ add_subdirectory(${target})
+ list(APPEND target_gpu_timing libc.benchmarks.gpu.timing.${target}.${target}_timing)
+endforeach()
+
+# Configure-time trace of the collected library names (debugging aid).
+message(STATUS "GPU TIMING: ${target_gpu_timing}")
+
+# Header library for the dispatching timing.h; depends on every per-target
+# timing library collected above.
+add_header_library(
+ timing
+ HDRS
+ timing.h
+ DEPENDS
+ ${target_gpu_timing}
+)
diff --git a/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt b/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt
new file mode 100644
index 0000000000000..7343ab7791197
--- /dev/null
+++ b/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt
@@ -0,0 +1,8 @@
+# Configure-time trace message (debugging aid).
+message(STATUS "IN nvptx dir")
+# Header library for the NVPTX timing.h in this directory. The parent
+# directory refers to it as libc.benchmarks.gpu.timing.nvptx.nvptx_timing.
+add_header_library(
+ nvptx_timing
+ HDRS
+ timing.h
+ DEPENDS
+ libc.src.__support.common
+)
diff --git a/libc/benchmarks/gpu/timing/nvptx/timing.h b/libc/benchmarks/gpu/timing/nvptx/timing.h
new file mode 100644
index 0000000000000..39c23d596b7f3
--- /dev/null
+++ b/libc/benchmarks/gpu/timing/nvptx/timing.h
@@ -0,0 +1,175 @@
+//===------------- NVPTX implementation of timing utils ---------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
+#define LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
+
+#include "src/__support/GPU/utils.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/attributes.h"
+#include "src/__support/macros/config.h"
+
+#include <stdint.h>
+
+namespace LIBC_NAMESPACE {
+
+// Returns the overhead associated with calling the profiling region. This
+// allows us to subtract the constant-time overhead from the latency to
+// obtain a true result. This can vary with system load.
+[[gnu::noinline]] static uint64_t overhead() {
+ // Read the input back through a volatile so it is not a known constant.
+ volatile uint32_t x = 1;
+ uint32_t y = x;
+ gpu::sync_threads();
+ uint64_t start = gpu::processor_clock();
+ // Empty asm taking y and start as inputs, so both must be available here.
+ asm volatile("" ::"r"(y), "llr"(start));
+ uint32_t result = y;
+ // The same no-op instruction that latency() issues on its result.
+ asm volatile("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :);
+ uint64_t stop = gpu::processor_clock();
+ gpu::sync_threads();
+ // Volatile store keeps result in use after the timed region.
+ volatile auto storage = result;
+ return stop - start;
+}
+
+// Stimulate a simple function and obtain its latency in clock cycles on the
+// system. This function cannot be inlined or else it will disturb the very
+// delicate balance of hard-coded dependencies.
+//
+// f is called exactly once as f(t); the return value is the difference between
+// the gpu::processor_clock() readings taken after and before that call.
+//
+// FIXME: This does not work in general on NVPTX because of further
+// optimizations ptxas performs. The only way to get consistent results is to
+// pass an extra "SHELL:-Xcuda-ptxas -O0" to CMake's compiler flag. This
+// negatively impacts performance but it is at least stable.
+template <typename F, typename T>
+[[gnu::noinline]] static LIBC_INLINE uint64_t latency(F f, T t) {
+ // We need to store the input somewhere to guarantee that the compiler will
+ // not constant propagate it and remove the profiling region.
+ volatile T storage = t;
+ T arg = storage;
+ asm volatile("" ::"r"(arg));
+
+ // Get the current timestamp from the clock.
+ gpu::sync_threads();
+ // NOTE(review): presumably a system-scope memory barrier builtin - confirm.
+ __nvvm_membar_sys();
+ uint64_t start = gpu::processor_clock();
+
+ // This forces the compiler to load the input argument and run the clock cycle
+ // counter before the profiling region.
+ asm volatile("" ::"r"(arg), "llr"(start));
+
+ // Run the function under test and return its value.
+ auto result = f(arg);
+
+ // This inline assembly performs a no-op which forces the result to both be
+ // used and prevents us from exiting this region before it's complete.
+ asm volatile("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :);
+
+ // Obtain the current timestamp after running the calculation and force
+ // ordering.
+ uint64_t stop = gpu::processor_clock();
+ __nvvm_membar_sys();
+ gpu::sync_threads();
+ asm volatile("" ::"r"(stop));
+ // Note: the result is converted to the argument type T for this store.
+ volatile T output = result;
+
+ // Return the time elapsed.
+ return stop - start;
+}
+
+// Trivial non-inlined function that returns its argument (converted to
+// uint64_t). function_call_latency() times it to estimate the cost of a bare
+// function call.
+[[gnu::noinline]] static uint64_t single_input_function(int x) {
+ asm volatile("" :: "r"(x)); // prevent the compiler from optimizing out x
+ return x;
+}
+
+// Measures f(t) with latency() and subtracts the latency measured for a
+// trivial non-inlined call (single_input_function), so the result approximates
+// the cost of f without the function-call overhead.
+//
+// Both measurements are unsigned and noisy. If the overhead sample happens to
+// exceed the sample for f, the difference is clamped to 0 rather than wrapping
+// around to a huge value.
+template <typename F, typename T>
+static LIBC_INLINE uint64_t function_call_latency(F f, T t) {
+ const uint64_t function_call_overhead = latency(single_input_function, 0);
+ const uint64_t total = latency(f, t);
+ return total > function_call_overhead ? total - function_call_overhead : 0;
+}
+
+// Overload for functions that take no arguments and return nothing: calls f
+// once and returns the difference between the gpu::processor_clock() readings
+// taken after and before the call. Unlike the single-argument overload, this
+// one is not marked noinline and issues no __nvvm_membar_sys().
+static LIBC_INLINE uint64_t latency(void (*f)()) {
+ // Get the current timestamp from the clock.
+ gpu::sync_threads();
+ uint64_t start = gpu::processor_clock();
+
+ // This forces the compiler to read the clock cycle counter before the
+ // profiling region (there is no input argument in this overload).
+ asm volatile("" ::"llr"(start));
+
+ // Run the function under test; it produces no value.
+ f();
+
+ // Obtain the current timestamp after running the calculation and force
+ // ordering.
+ uint64_t stop = gpu::processor_clock();
+ gpu::sync_threads();
+
+ // Return the time elapsed.
+ return stop - start;
+}
+
+// Two-argument overload: calls f(t1, t2) once and returns the difference
+// between the gpu::processor_clock() readings taken after and before the call.
+// Unlike the single-argument overload, this one is not marked noinline and
+// issues no __nvvm_membar_sys().
+template <typename F, typename T1, typename T2>
+static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {
+ // Round-trip both inputs through volatiles so they are not known constants.
+ volatile T1 storage = t1;
+ volatile T2 storage2 = t2;
+ T1 arg = storage;
+ T2 arg2 = storage2;
+ asm volatile("" ::"r"(arg), "r"(arg2));
+
+ // Get the current timestamp from the clock.
+ gpu::sync_threads();
+ uint64_t start = gpu::processor_clock();
+
+ // Empty asm taking the inputs and the start timestamp, so all of them must
+ // be available before the profiling region.
+ asm volatile("" ::"r"(arg), "r"(arg2), "llr"(start));
+
+ // Run the function under test.
+ auto result = f(arg, arg2);
+
+ // No-op instruction on the result so that it is used inside the region.
+ asm volatile("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :);
+
+ // Take the end timestamp.
+ uint64_t stop = gpu::processor_clock();
+ gpu::sync_threads();
+ asm volatile("" ::"r"(stop));
+ // Volatile store keeps the result in use after the timed region.
+ volatile auto output = result;
+
+ // Return the time elapsed.
+ return stop - start;
+}
+
+} // namespace LIBC_NAMESPACE
+
+/**
+ * Statement macro that times one call of Func(t) and stores the elapsed
+ * clock count (stop - start) through LatencyP. It also logs the incoming
+ * value of *LatencyP and the start, stop and difference values via tlog.
+ *
+ * LatencyP must be a pointer to a uint64_t holding the result variable
+ *
+ * The macro arguments are evaluated once, before any local is declared, and
+ * every local carries an sio_ prefix. Previously the locals were named `arg`
+ * and `result`, so a call such as
+ * SINGLE_INPUT_OUTPUT_LATENCY(f, arg, &result) made LatencyP refer to the
+ * macro's own `result` instead of the caller's variable. The logger and the
+ * gpu helpers are fully qualified so the macro also expands correctly outside
+ * LIBC_NAMESPACE.
+ *
+ * The intermediate stores of 200, 312 and 499 are debugging markers kept from
+ * the original; the final store overwrites them.
+ */
+#define SINGLE_INPUT_OUTPUT_LATENCY(Func, t, LatencyP) \
+ do { \
+ uint64_t *const sio_latency_ptr = (LatencyP); \
+ volatile auto sio_storage = (t); \
+ LIBC_NAMESPACE::libc_gpu_benchmarks::tlog \
+ << "Latency: " << ((long)(*sio_latency_ptr)) << '\n'; \
+ *sio_latency_ptr = 200; \
+ auto sio_arg = sio_storage; \
+ asm volatile("" ::"r"(sio_arg), "r"(sio_latency_ptr)); \
+ \
+ LIBC_NAMESPACE::gpu::sync_threads(); \
+ uint64_t sio_start = LIBC_NAMESPACE::gpu::processor_clock(); \
+ \
+ asm volatile("" ::"r"(sio_arg), "llr"(sio_start)); \
+ auto sio_result = Func(sio_arg); \
+ asm volatile("" ::"r"(sio_latency_ptr)); \
+ *sio_latency_ptr = 312; \
+ asm volatile("" ::"r"(sio_latency_ptr)); \
+ asm volatile("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(sio_result) :); \
+ asm volatile("" ::"r"(sio_latency_ptr)); \
+ *sio_latency_ptr = 499; \
+ \
+ uint64_t sio_stop = LIBC_NAMESPACE::gpu::processor_clock(); \
+ LIBC_NAMESPACE::gpu::sync_threads(); \
+ volatile auto sio_output = sio_result; \
+ \
+ LIBC_NAMESPACE::libc_gpu_benchmarks::tlog << "Start: " << sio_start << '\n'; \
+ LIBC_NAMESPACE::libc_gpu_benchmarks::tlog << "Stop: " << sio_stop << '\n'; \
+ LIBC_NAMESPACE::libc_gpu_benchmarks::tlog \
+ << "Diff: " << (sio_stop - sio_start) << '\n'; \
+ asm volatile("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(sio_latency_ptr) \
+ :); \
+ *sio_latency_ptr = sio_stop - sio_start; \
+ } while (0)
+
+#endif // LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
diff --git a/libc/benchmarks/gpu/timing/timing.h b/libc/benchmarks/gpu/timing/timing.h
new file mode 100644
index 0000000000000..45201e56964e6
--- /dev/null
+++ b/libc/benchmarks/gpu/timing/timing.h
@@ -0,0 +1,22 @@
+//===------------- Implementation of GPU timing utils -----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_UTILS_GPU_TIMING_H
+#define LLVM_LIBC_UTILS_GPU_TIMING_H
+
+#include "src/__support/macros/properties/architectures.h"
+
+#if defined(LIBC_TARGET_ARCH_IS_AMDGPU)
+#error "amdgpu not yet supported
+#elif defined(LIBC_TARGET_ARCH_IS_NVPTX)
+#include "nvptx/timing.h"
+#else
+#error "unsupported platform"
+#endif
+
+#endif // LLVM_LIBC_UTILS_GPU_TIMING_H
\ No newline at end of file
>From 89c13395da99a54d00ecc276c329e1e15a20406d Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Sun, 12 May 2024 13:45:10 -0400
Subject: [PATCH 2/3] clean up
---
libc/CMakeLists.txt | 2 -
libc/benchmarks/gpu/BenchmarkLogger.cpp | 89 ++++++++++++
.../gpu/{TestLogger.h => BenchmarkLogger.h} | 14 +-
libc/benchmarks/gpu/CMakeLists.txt | 11 +-
libc/benchmarks/gpu/LibcGpuBenchmark.cpp | 22 +--
libc/benchmarks/gpu/LibcGpuBenchmark.h | 128 +++---------------
libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp | 2 +-
libc/benchmarks/gpu/TestLogger.cpp | 89 ------------
libc/benchmarks/gpu/dummy.cpp | 41 ++----
libc/benchmarks/gpu/timing/CMakeLists.txt | 8 --
.../gpu/timing/nvptx/CMakeLists.txt | 1 -
11 files changed, 137 insertions(+), 270 deletions(-)
create mode 100644 libc/benchmarks/gpu/BenchmarkLogger.cpp
rename libc/benchmarks/gpu/{TestLogger.h => BenchmarkLogger.h} (66%)
delete mode 100644 libc/benchmarks/gpu/TestLogger.cpp
diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt
index ba6eadd207a4f..175efd89d67e6 100644
--- a/libc/CMakeLists.txt
+++ b/libc/CMakeLists.txt
@@ -400,9 +400,7 @@ if(LLVM_INCLUDE_TESTS)
add_subdirectory(fuzzing)
endif()
-message(STATUS "Checking on variable: ${LIBC_INCLUDE_BENCHMARKS}")
if(LIBC_INCLUDE_BENCHMARKS)
- message(STATUS "including libc benchmarks")
add_subdirectory(benchmarks)
endif()
diff --git a/libc/benchmarks/gpu/BenchmarkLogger.cpp b/libc/benchmarks/gpu/BenchmarkLogger.cpp
new file mode 100644
index 0000000000000..94a0d897c9585
--- /dev/null
+++ b/libc/benchmarks/gpu/BenchmarkLogger.cpp
@@ -0,0 +1,89 @@
+#include "benchmarks/gpu/BenchmarkLogger.h"
+#include "src/__support/CPP/string.h"
+#include "src/__support/CPP/string_view.h"
+#include "src/__support/OSUtil/io.h" // write_to_stderr
+#include "src/__support/big_int.h" // is_big_int
+#include "src/__support/macros/properties/types.h" // LIBC_TYPES_HAS_INT128
+#include "src/__support/uint128.h"
+
+#include <stdint.h>
+
+namespace LIBC_NAMESPACE {
+namespace libc_gpu_benchmarks {
+
+// cpp::string_view specialization
+// Every other overload eventually funnels into this one, which hands the text
+// to write_to_stderr.
+template <>
+BenchmarkLogger &BenchmarkLogger::operator<< <cpp::string_view>(cpp::string_view str) {
+ LIBC_NAMESPACE::write_to_stderr(str);
+ return *this;
+}
+
+// cpp::string specialization
+template <> BenchmarkLogger &BenchmarkLogger::operator<< <cpp::string>(cpp::string str) {
+ return *this << static_cast<cpp::string_view>(str);
+}
+
+// const char* specialization
+template <> BenchmarkLogger &BenchmarkLogger::operator<< <const char *>(const char *str) {
+ return *this << cpp::string_view(str);
+}
+
+// char* specialization
+template <> BenchmarkLogger &BenchmarkLogger::operator<< <char *>(char *str) {
+ return *this << cpp::string_view(str);
+}
+
+// char specialization
+// Logs the character itself (a one-element view), not its numeric value.
+template <> BenchmarkLogger &BenchmarkLogger::operator<<(char ch) {
+ return *this << cpp::string_view(&ch, 1);
+}
+
+// bool specialization
+template <> BenchmarkLogger &BenchmarkLogger::operator<<(bool cond) {
+ return *this << (cond ? "true" : "false");
+}
+
+// void * specialization
+// Logs the address in hexadecimal. The "0x" prefix comes from the formatter
+// itself; previously a literal "0x" was followed by the decimal digits
+// produced by cpp::to_string, which printed a misleading value.
+template <> BenchmarkLogger &BenchmarkLogger::operator<<(void *addr) {
+ const IntegerToString<uintptr_t, radix::Hex::WithPrefix> buffer(
+ reinterpret_cast<uintptr_t>(addr));
+ return *this << buffer.view();
+}
+
+// Generic path used by the explicit instantiations below: big integers and
+// unsigned integers wider than 64 bits are logged as prefixed hexadecimal,
+// everything else goes through cpp::to_string.
+template <typename T> BenchmarkLogger &BenchmarkLogger::operator<<(T t) {
+ if constexpr (is_big_int_v<T> ||
+ (cpp::is_integral_v<T> && cpp::is_unsigned_v<T> &&
+ (sizeof(T) > sizeof(uint64_t)))) {
+ static_assert(sizeof(T) % 8 == 0, "Unsupported size of UInt");
+ const IntegerToString<T, radix::Hex::WithPrefix> buffer(t);
+ return *this << buffer.view();
+ } else {
+ return *this << cpp::to_string(t);
+ }
+}
+
+// is_integral specializations
+// char is already specialized to handle character
+template BenchmarkLogger &BenchmarkLogger::operator<< <short>(short);
+template BenchmarkLogger &BenchmarkLogger::operator<< <int>(int);
+template BenchmarkLogger &BenchmarkLogger::operator<< <long>(long);
+template BenchmarkLogger &BenchmarkLogger::operator<< <long long>(long long);
+template BenchmarkLogger &BenchmarkLogger::operator<< <unsigned char>(unsigned char);
+template BenchmarkLogger &BenchmarkLogger::operator<< <unsigned short>(unsigned short);
+template BenchmarkLogger &BenchmarkLogger::operator<< <unsigned int>(unsigned int);
+template BenchmarkLogger &BenchmarkLogger::operator<< <unsigned long>(unsigned long);
+template BenchmarkLogger &
+ BenchmarkLogger::operator<< <unsigned long long>(unsigned long long);
+
+#ifdef LIBC_TYPES_HAS_INT128
+template BenchmarkLogger &BenchmarkLogger::operator<< <__uint128_t>(__uint128_t);
+#endif // LIBC_TYPES_HAS_INT128
+template BenchmarkLogger &BenchmarkLogger::operator<< <UInt<128>>(UInt<128>);
+template BenchmarkLogger &BenchmarkLogger::operator<< <UInt<192>>(UInt<192>);
+template BenchmarkLogger &BenchmarkLogger::operator<< <UInt<256>>(UInt<256>);
+template BenchmarkLogger &BenchmarkLogger::operator<< <UInt<320>>(UInt<320>);
+
+// TODO: Add floating point formatting once it's supported by StringStream.
+
+// Definition of the global logger declared in BenchmarkLogger.h.
+BenchmarkLogger blog;
+
+} // namespace libc_gpu_benchmarks
+} // namespace LIBC_NAMESPACE
diff --git a/libc/benchmarks/gpu/TestLogger.h b/libc/benchmarks/gpu/BenchmarkLogger.h
similarity index 66%
rename from libc/benchmarks/gpu/TestLogger.h
rename to libc/benchmarks/gpu/BenchmarkLogger.h
index 68690ce0c3940..ed3cc97e59c6d 100644
--- a/libc/benchmarks/gpu/TestLogger.h
+++ b/libc/benchmarks/gpu/BenchmarkLogger.h
@@ -6,22 +6,22 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIBC_TEST_UNITTEST_TESTLOGGER_H
-#define LLVM_LIBC_TEST_UNITTEST_TESTLOGGER_H
+#ifndef LLVM_LIBC_BENCHMARKS_GPU_BENCHMARKLOGGER_H
+#define LLVM_LIBC_BENCHMARKS_GPU_BENCHMARKLOGGER_H
namespace LIBC_NAMESPACE {
namespace libc_gpu_benchmarks {
// A class to log to standard output in the context of hermetic tests.
-struct TestLogger {
- constexpr TestLogger() = default;
- template <typename T> TestLogger &operator<<(T);
+struct BenchmarkLogger {
+ constexpr BenchmarkLogger() = default;
+ template <typename T> BenchmarkLogger &operator<<(T);
};
// A global TestLogger instance to be used in tests.
-extern TestLogger tlog;
+extern BenchmarkLogger blog;
} // namespace libc_gpu_benchmarks
} // namespace LIBC_NAMESPACE
-#endif /* LLVM_LIBC_TEST_UNITTEST_TESTLOGGER_H */
+#endif /* LLVM_LIBC_BENCHMARKS_GPU_BENCHMARKLOGGER_H */
diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
index 2f258da8a3297..5920c5b5e5dfc 100644
--- a/libc/benchmarks/gpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/CMakeLists.txt
@@ -26,7 +26,6 @@ function (add_gpu_benchmark test_name)
get_fq_target_name(${test_name} fq_target_name)
get_fq_target_name(${test_name}.libc fq_libc_target_name) # Stores the compiled libc + infrastructure archive to link in
get_fq_deps_list(fq_deps_list ${GPU_BENCHMARK_DEPENDS})
- message(STATUS "Depends: ${fq_deps_list}")
list(APPEND fq_deps_list
# Hermetic tests use the platform's startup object. So, their deps also
# have to be collected.
@@ -52,12 +51,10 @@ function (add_gpu_benchmark test_name)
if(LIBC_CMAKE_VERBOSE_LOGGING)
set(msg "Skipping hermetic test ${fq_target_name} as it has missing deps: "
"${skipped_entrypoints_list}.")
- message(STATUS ${msg})
endif()
return()
endif()
list(REMOVE_DUPLICATES link_object_files)
- message(STATUS ${link_object_files})
# Make a library of all deps
add_library(
@@ -88,7 +85,6 @@ function (add_gpu_benchmark test_name)
target_include_directories(${fq_build_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR})
target_include_directories(${fq_build_target_name} PRIVATE ${LIBC_SOURCE_DIR})
_get_hermetic_test_compile_options(compile_options "${GPU_BENCHMARK_COMPILE_OPTIONS}")
- message(STATUS "adding compile: ${compile_options}")
target_compile_options(${fq_build_target_name} PRIVATE ${compile_options} -save-temps)
set(link_libraries "")
@@ -100,8 +96,6 @@ function (add_gpu_benchmark test_name)
endif()
endforeach()
- message(STATUS "IS nvptx: ${LIBC_TARGET_ARCHITECTURE_IS_NVPTX}")
-
if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
target_link_options(${fq_build_target_name} PRIVATE
${LIBC_COMPILE_OPTIONS_DEFAULT}
@@ -116,7 +110,6 @@ function (add_gpu_benchmark test_name)
"-Wl,--suppress-stack-size-warning"
-march=${LIBC_GPU_TARGET_ARCHITECTURE} -nostdlib -static
"--cuda-path=${LIBC_CUDA_ROOT}")
- message(STATUS "ARCH: ${LIBC_GPU_TARGET_ARCHITECTURE}")
elseif(LIBC_CC_SUPPORTS_NOSTDLIBPP)
target_link_options(${fq_build_target_name} PRIVATE -nolibc -nostartfiles -nostdlib++ -static)
else()
@@ -170,10 +163,10 @@ add_unittest_framework_library(
SRCS
LibcGpuBenchmark.cpp
LibcGpuBenchmarkMain.cpp
- TestLogger.cpp
+ BenchmarkLogger.cpp
HDRS
LibcGpuBenchmark.h
- TestLogger.h
+ BenchmarkLogger.h
DEPENDS
libc.src.__support.big_int
libc.src.__support.c_string
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index 46e8f904e0643..f46f4a08362d8 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -10,24 +10,24 @@
namespace LIBC_NAMESPACE {
namespace libc_gpu_benchmarks {
-Test *Test::Start = nullptr;
-Test *Test::End = nullptr;
+Benchmark *Benchmark::Start = nullptr;
+Benchmark *Benchmark::End = nullptr;
-void Test::addTest(Test *T) {
+void Benchmark::addBenchmark(Benchmark *B) {
if (End == nullptr) {
- Start = T;
- End = T;
+ Start = B;
+ End = B;
return;
}
- End->Next = T;
- End = T;
+ End->Next = B;
+ End = B;
}
-int Test::runTests() {
- for (Test *T = Start; T != nullptr; T = T->Next) {
- tlog << T->getName() << "\n";
- T->Run();
+int Benchmark::runBenchmarks() {
+ for (Benchmark *B = Start; B != nullptr; B = B->Next) {
+ tlog << B->getName() << "\n";
+ B->Run();
}
return 0;
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index 12695c4d18684..c9e4014808dc0 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -3,7 +3,7 @@
#include "benchmarks/gpu/timing/timing.h"
-#include "benchmarks/gpu/TestLogger.h"
+#include "benchmarks/gpu/BenchmarkLogger.h"
#include "src/__support/CPP/string.h"
#include "src/__support/CPP/string_view.h"
#include "src/__support/OSUtil/io.h"
@@ -100,107 +100,57 @@ uint64_t benchmark(const BenchmarkOptions &Options, F f, Args... args) {
Iterations *= Options.ScalingFactor;
}
- // for (int i = 0; i < 3; i++) {
- // uint64_t result = latency(f, args...);
- // BestGuess = result;
- // write_to_stderr(cpp::to_string(result));
- // write_to_stderr(cpp::string_view("\n"));
- // }
tlog << "Best Guess: " << BestGuess << '\n';
tlog << "Samples: " << Samples << '\n';
+ tlog << "\n";
return BestGuess;
};
uint64_t benchmark_wrapper(const BenchmarkOptions &Options,
uint64_t (*WrapperFunc)());
-template <typename F, typename Arg>
-uint64_t benchmark_macro(const BenchmarkOptions &Options, F f, Arg arg) {
- RuntimeEstimationProgression REP;
- size_t Iterations = Options.InitialIterations;
- if (Iterations < (uint32_t)1) {
- Iterations = 1;
- }
- size_t Samples = 0;
- uint64_t BestGuess = 0;
- uint64_t TotalCycles = 0;
- for (;;) {
- uint64_t SampleCycles = 0;
- for (uint32_t i = 0; i < Iterations; i++) {
- uint64_t result = 0;
- SINGLE_INPUT_OUTPUT_LATENCY(f, arg, &result);
- SampleCycles += result;
- tlog << "Macro: " << result << '\n';
- }
-
- Samples++;
- TotalCycles += SampleCycles;
- const double ChangeRatio =
- REP.ComputeImprovement({Iterations, SampleCycles});
- BestGuess = REP.CurrentEstimation;
-
- if (Samples >= Options.MaxSamples || Iterations >= Options.MaxIterations) {
- break;
- } else if (Samples >= Options.MinSamples && ChangeRatio < Options.Epsilon) {
- tlog << "Samples are stable!\n";
- break;
- }
-
- Iterations *= Options.ScalingFactor;
- }
- // for (int i = 0; i < 3; i++) {
- // uint64_t result = latency(f, args...);
- // BestGuess = result;
- // write_to_stderr(cpp::to_string(result));
- // write_to_stderr(cpp::string_view("\n"));
- // }
- tlog << "Macro Best Guess: " << BestGuess << '\n';
- tlog << "Samples: " << Samples << '\n';
- return BestGuess;
-};
-
-class Test {
- Test *Next = nullptr;
+class Benchmark {
+ Benchmark *Next = nullptr;
public:
- virtual ~Test() {}
+ virtual ~Benchmark() {}
virtual void SetUp() {}
virtual void TearDown() {}
- static int runTests();
+ static int runBenchmarks();
protected:
- static void addTest(Test *);
+ static void addBenchmark(Benchmark *);
private:
virtual void Run() = 0;
virtual const char *getName() const = 0;
- static Test *Start;
- static Test *End;
+ static Benchmark *Start;
+ static Benchmark *End;
};
-template <typename F> class FunctionBenchmark : public Test {
+template <typename F> class FunctionBenchmark : public Benchmark {
F Func;
const char *Name;
public:
FunctionBenchmark(F Func, char const *Name) : Func(Func), Name(Name) {
- addTest(this);
+ addBenchmark(this);
}
private:
void Run() override {
BenchmarkOptions Options;
auto latency = benchmark(Options, Func);
- LIBC_NAMESPACE::libc_gpu_benchmarks::tlog << "FnName: " << Name << '\n';
- LIBC_NAMESPACE::libc_gpu_benchmarks::tlog << "FnBenchmark: " << latency
- << '\n';
+ tlog << "FnName: " << Name << '\n';
+ tlog << "FnBenchmark: " << latency << '\n';
+ tlog << "\n";
}
const char *getName() const override { return Name; }
};
-class WrapperBenchmark : public Test {
+class WrapperBenchmark : public Benchmark {
using BenchmarkWrapperFunction = uint64_t (*)();
BenchmarkWrapperFunction Func;
const char *Name;
@@ -208,22 +158,17 @@ class WrapperBenchmark : public Test {
public:
WrapperBenchmark(BenchmarkWrapperFunction Func, char const *Name)
: Func(Func), Name(Name) {
- addTest(this);
+ addBenchmark(this);
}
private:
void Run() override {
tlog << "Running wrapper: " << Name << '\n';
- // for (int i = 0; i < 10; i++) {
- // auto overhead = LIBC_NAMESPACE::overhead();
- // auto result = Func() - overhead;
- // tlog << "Result: " << result << '\n';
- // tlog << "Overhead: " << overhead << '\n';
- // }
BenchmarkOptions Options;
auto latency = benchmark_wrapper(Options, Func);
tlog << "FnName: " << Name << '\n';
tlog << "FnBenchmark: " << latency << '\n';
+ tlog << "\n";
}
const char *getName() const override { return Name; }
};
@@ -232,31 +177,6 @@ class WrapperBenchmark : public Test {
} // namespace LIBC_NAMESPACE
-// #define BENCHMARK(SuiteName, TestName) \
-// class SuiteName##_##TestName \
-// : public LIBC_NAMESPACE::libc_gpu_benchmarks::Test { \
-// public: \
-// SuiteName##_##TestName() { addTest(this); } \
-// void Run() override; \
-// const char *getName() const override { return #SuiteName "." #TestName; } \
-// }; \
-// SuiteName##_##TestName SuiteName##_##TestName##_Instance; \
-// void SuiteName##_##TestName::Run()
-
-#define BENCHMARK_SINGLE_INPUT_OUTPUT(SuiteName, TestName, Func, Arg) \
- class SuiteName##_##TestName \
- : public LIBC_NAMESPACE::libc_gpu_benchmarks::Test { \
- public: \
- SuiteName##_##TestName() { addTest(this); } \
- void Run() override; \
- const char *getName() const override { return #SuiteName "." #TestName; } \
- }; \
- SuiteName##_##TestName SuiteName##_##TestName##_Instance; \
- void SuiteName##_##TestName::Run() { \
- LIBC_NAMESPACE::libc_gpu_benchmarks::BenchmarkOptions Options; \
- LIBC_NAMESPACE::libc_gpu_benchmarks::benchmark(Options, &Func, Arg); \
- }
-
#define BENCHMARK_FN(SuiteName, TestName, Func) \
LIBC_NAMESPACE::libc_gpu_benchmarks::FunctionBenchmark \
SuiteName##_##TestName##_Instance(Func, #SuiteName "." #TestName);
@@ -265,18 +185,4 @@ class WrapperBenchmark : public Test {
LIBC_NAMESPACE::libc_gpu_benchmarks::WrapperBenchmark \
SuiteName##_##TestName##_Instance(Func, #SuiteName "." #TestName);
-#define BENCHMARK_S_I_O_V2(SuiteName, TestName, Func, Arg) \
- class SuiteName##_##TestName \
- : public LIBC_NAMESPACE::libc_gpu_benchmarks::Test { \
- public: \
- SuiteName##_##TestName() { addTest(this); } \
- void Run() override; \
- const char *getName() const override { return #SuiteName "." #TestName; } \
- }; \
- SuiteName##_##TestName SuiteName##_##TestName##_Instance; \
- void SuiteName##_##TestName::Run() { \
- LIBC_NAMESPACE::libc_gpu_benchmarks::BenchmarkOptions Options; \
- LIBC_NAMESPACE::libc_gpu_benchmarks::benchmark_macro(Options, &Func, Arg); \
- }
-
#endif
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp b/libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp
index e7e4f08f5af68..c971b00cc9a1b 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp
@@ -1,6 +1,6 @@
#include "LibcGpuBenchmark.h"
extern "C" int main(int argc, char **argv, char **envp) {
- LIBC_NAMESPACE::libc_gpu_benchmarks::Test::runTests();
+ LIBC_NAMESPACE::libc_gpu_benchmarks::Benchmark::runBenchmarks();
return 0;
}
diff --git a/libc/benchmarks/gpu/TestLogger.cpp b/libc/benchmarks/gpu/TestLogger.cpp
deleted file mode 100644
index b3a8399a91adb..0000000000000
--- a/libc/benchmarks/gpu/TestLogger.cpp
+++ /dev/null
@@ -1,89 +0,0 @@
-#include "benchmarks/gpu/TestLogger.h"
-#include "src/__support/CPP/string.h"
-#include "src/__support/CPP/string_view.h"
-#include "src/__support/OSUtil/io.h" // write_to_stderr
-#include "src/__support/big_int.h" // is_big_int
-#include "src/__support/macros/properties/types.h" // LIBC_TYPES_HAS_INT128
-#include "src/__support/uint128.h"
-
-#include <stdint.h>
-
-namespace LIBC_NAMESPACE {
-namespace libc_gpu_benchmarks {
-
-// cpp::string_view specialization
-template <>
-TestLogger &TestLogger::operator<< <cpp::string_view>(cpp::string_view str) {
- LIBC_NAMESPACE::write_to_stderr(str);
- return *this;
-}
-
-// cpp::string specialization
-template <> TestLogger &TestLogger::operator<< <cpp::string>(cpp::string str) {
- return *this << static_cast<cpp::string_view>(str);
-}
-
-// const char* specialization
-template <> TestLogger &TestLogger::operator<< <const char *>(const char *str) {
- return *this << cpp::string_view(str);
-}
-
-// char* specialization
-template <> TestLogger &TestLogger::operator<< <char *>(char *str) {
- return *this << cpp::string_view(str);
-}
-
-// char specialization
-template <> TestLogger &TestLogger::operator<<(char ch) {
- return *this << cpp::string_view(&ch, 1);
-}
-
-// bool specialization
-template <> TestLogger &TestLogger::operator<<(bool cond) {
- return *this << (cond ? "true" : "false");
-}
-
-// void * specialization
-template <> TestLogger &TestLogger::operator<<(void *addr) {
- return *this << "0x" << cpp::to_string(reinterpret_cast<uintptr_t>(addr));
-}
-
-template <typename T> TestLogger &TestLogger::operator<<(T t) {
- if constexpr (is_big_int_v<T> ||
- (cpp::is_integral_v<T> && cpp::is_unsigned_v<T> &&
- (sizeof(T) > sizeof(uint64_t)))) {
- static_assert(sizeof(T) % 8 == 0, "Unsupported size of UInt");
- const IntegerToString<T, radix::Hex::WithPrefix> buffer(t);
- return *this << buffer.view();
- } else {
- return *this << cpp::to_string(t);
- }
-}
-
-// is_integral specializations
-// char is already specialized to handle character
-template TestLogger &TestLogger::operator<< <short>(short);
-template TestLogger &TestLogger::operator<< <int>(int);
-template TestLogger &TestLogger::operator<< <long>(long);
-template TestLogger &TestLogger::operator<< <long long>(long long);
-template TestLogger &TestLogger::operator<< <unsigned char>(unsigned char);
-template TestLogger &TestLogger::operator<< <unsigned short>(unsigned short);
-template TestLogger &TestLogger::operator<< <unsigned int>(unsigned int);
-template TestLogger &TestLogger::operator<< <unsigned long>(unsigned long);
-template TestLogger &
- TestLogger::operator<< <unsigned long long>(unsigned long long);
-
-#ifdef LIBC_TYPES_HAS_INT128
-template TestLogger &TestLogger::operator<< <__uint128_t>(__uint128_t);
-#endif // LIBC_TYPES_HAS_INT128
-template TestLogger &TestLogger::operator<< <UInt<128>>(UInt<128>);
-template TestLogger &TestLogger::operator<< <UInt<192>>(UInt<192>);
-template TestLogger &TestLogger::operator<< <UInt<256>>(UInt<256>);
-template TestLogger &TestLogger::operator<< <UInt<320>>(UInt<320>);
-
-// TODO: Add floating point formatting once it's supported by StringStream.
-
-TestLogger tlog;
-
-} // namespace libc_gpu_benchmarks
-} // namespace LIBC_NAMESPACE
diff --git a/libc/benchmarks/gpu/dummy.cpp b/libc/benchmarks/gpu/dummy.cpp
index 6d15f98fca220..fa9b220f33b23 100644
--- a/libc/benchmarks/gpu/dummy.cpp
+++ b/libc/benchmarks/gpu/dummy.cpp
@@ -1,42 +1,21 @@
#include "LibcGpuBenchmark.h"
#include "timing/timing.h"
-#include "src/stdio/fputs.h"
-
-int add_test(int x) {
- return x + 1;
-}
+int add_test(int x) { return x + 1; }
__attribute__((noinline)) [[gnu::noinline]] int function_call_overhead(int x) {
- asm volatile ("");
- return x;
+ asm volatile("");
+ return x;
}
-// void DummyHiBenchmark() {
-// LIBC_NAMESPACE::fputs("Hi\n", stderr);
-// }
-// BENCHMARK_FN(Dummy, DummyHiBenchmark, DummyHiBenchmark);
-
-// void DummyV2Benchmark() {
-// int result = dummy_hi(10);
-// asm volatile("" :: "r"(result));
-// auto test_cycles = LIBC_NAMESPACE::latency(dummy_hi, 10);
-// LIBC_NAMESPACE::libc_gpu_benchmarks::tlog << "In func: " << test_cycles << '\n';
-// }
-// BENCHMARK_FN(Dummy, DummyV2Benchmark, DummyV2Benchmark);
-
-// BENCHMARK_SINGLE_INPUT_OUTPUT(Dummy, DummySingleInputOutput, dummy_hi, 10);
-
-// BENCHMARK_S_I_O_V2(Dummy, DummySIOMacro, dummy_hi, 10);
-
-// uint64_t DummyWrapperBenchmark() {
-// int x = 10;
-// return LIBC_NAMESPACE::latency(add_test, x);
-// }
-// BENCHMARK_WRAPPER(Dummy, DummyWrapperBenchmark, DummyWrapperBenchmark);
+uint64_t DummyWrapperBenchmark() {
+ int x = 10;
+ return LIBC_NAMESPACE::latency(add_test, x);
+}
+BENCHMARK_WRAPPER(Dummy, DummyWrapperBenchmark, DummyWrapperBenchmark);
uint64_t DummyFunctionCallOverhead() {
- int x = 10;
- return LIBC_NAMESPACE::latency(function_call_overhead, x);
+ int x = 10;
+ return LIBC_NAMESPACE::latency(function_call_overhead, x);
}
BENCHMARK_WRAPPER(Dummy, DummyFunctionCallOverhead, DummyFunctionCallOverhead);
diff --git a/libc/benchmarks/gpu/timing/CMakeLists.txt b/libc/benchmarks/gpu/timing/CMakeLists.txt
index 4a65b25005bab..0e6a5a6b47968 100644
--- a/libc/benchmarks/gpu/timing/CMakeLists.txt
+++ b/libc/benchmarks/gpu/timing/CMakeLists.txt
@@ -1,16 +1,8 @@
-message(STATUS "In GPU subdir")
-# if(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU)
-# message(STATUS "Target arch is NOT gpu")
-# return()
-# endif()
-
foreach(target nvptx)
add_subdirectory(${target})
list(APPEND target_gpu_timing libc.benchmarks.gpu.timing.${target}.${target}_timing)
endforeach()
-message(STATUS "GPU TIMING: ${target_gpu_timing}")
-
add_header_library(
timing
HDRS
diff --git a/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt b/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt
index 7343ab7791197..9958e16206a41 100644
--- a/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt
+++ b/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt
@@ -1,4 +1,3 @@
-message(STATUS "IN nvptx dir")
add_header_library(
nvptx_timing
HDRS
>From 9273e50a8876b502475aea1547bd84d47c32b55e Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Sun, 12 May 2024 16:44:06 -0400
Subject: [PATCH 3/3] clean up experimentation code
---
libc/benchmarks/CMakeLists.txt | 6 +-
libc/benchmarks/gpu/CMakeLists.txt | 15 +--
libc/benchmarks/gpu/LibcGpuBenchmark.cpp | 23 ++---
libc/benchmarks/gpu/LibcGpuBenchmark.h | 92 +++----------------
libc/benchmarks/gpu/dummy.cpp | 21 -----
.../gpu/src/ctype/isalnum_benchmark.cpp | 24 +++--
.../gpu/src/ctype/isalpha_benchmark.cpp | 2 +-
libc/benchmarks/gpu/timing/nvptx/timing.h | 67 --------------
libc/benchmarks/gpu/timing/timing.h | 3 +-
9 files changed, 40 insertions(+), 213 deletions(-)
delete mode 100644 libc/benchmarks/gpu/dummy.cpp
diff --git a/libc/benchmarks/CMakeLists.txt b/libc/benchmarks/CMakeLists.txt
index 1fb7026d79359..a802e653a091e 100644
--- a/libc/benchmarks/CMakeLists.txt
+++ b/libc/benchmarks/CMakeLists.txt
@@ -91,10 +91,10 @@ if(NOT LIBC_TARGET_OS_IS_GPU)
)
target_link_libraries(libc-benchmark
PUBLIC
- benchmark::benchmark
- LLVMSupport
+ benchmark::benchmark
+ LLVMSupport
LLVMTargetParser
- Threads::Threads
+ Threads::Threads
)
add_dependencies(libc-benchmark google-benchmark-libc)
llvm_update_compile_flags(libc-benchmark)
diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
index 5920c5b5e5dfc..a18be27e33573 100644
--- a/libc/benchmarks/gpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/CMakeLists.txt
@@ -85,7 +85,7 @@ function (add_gpu_benchmark test_name)
target_include_directories(${fq_build_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR})
target_include_directories(${fq_build_target_name} PRIVATE ${LIBC_SOURCE_DIR})
_get_hermetic_test_compile_options(compile_options "${GPU_BENCHMARK_COMPILE_OPTIONS}")
- target_compile_options(${fq_build_target_name} PRIVATE ${compile_options} -save-temps)
+ target_compile_options(${fq_build_target_name} PRIVATE ${compile_options})
set(link_libraries "")
foreach(lib in LISTS GPU_BENCHMARK_LINK_LIBRARIES)
@@ -180,17 +180,4 @@ add_unittest_framework_library(
libc.benchmarks.gpu.timing.timing
)
-add_custom_target(dummy-suite)
-
-add_gpu_benchmark(
- dummy
- SUITE
- dummy-suite
- SRCS
- dummy.cpp
- DEPENDS
- libc.src.stdio.fputs
- libc.src.stdio.stderr
-)
-
add_subdirectory(src)
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index f46f4a08362d8..d37f5a0a53a70 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -1,10 +1,3 @@
-
-#include "benchmarks/gpu/timing/timing.h"
-
-#include "src/__support/CPP/string.h"
-#include "src/__support/CPP/string_view.h"
-#include "src/__support/OSUtil/io.h"
-
#include "LibcGpuBenchmark.h"
namespace LIBC_NAMESPACE {
@@ -26,16 +19,17 @@ void Benchmark::addBenchmark(Benchmark *B) {
int Benchmark::runBenchmarks() {
for (Benchmark *B = Start; B != nullptr; B = B->Next) {
- tlog << B->getName() << "\n";
B->Run();
}
return 0;
}
-uint64_t benchmark_wrapper(const BenchmarkOptions &Options,
- uint64_t (*WrapperFunc)()) {
+BenchmarkResult benchmark(const BenchmarkOptions &Options,
+ uint64_t (*WrapperFunc)()) {
+ BenchmarkResult Result;
RuntimeEstimationProgression REP;
+ size_t TotalIterations = 0;
size_t Iterations = Options.InitialIterations;
if (Iterations < (uint32_t)1) {
Iterations = 1;
@@ -53,6 +47,7 @@ uint64_t benchmark_wrapper(const BenchmarkOptions &Options,
Samples++;
TotalCycles += SampleCycles;
+ TotalIterations += Iterations;
const double ChangeRatio =
REP.ComputeImprovement({Iterations, SampleCycles});
BestGuess = REP.CurrentEstimation;
@@ -60,15 +55,15 @@ uint64_t benchmark_wrapper(const BenchmarkOptions &Options,
if (Samples >= Options.MaxSamples || Iterations >= Options.MaxIterations) {
break;
} else if (Samples >= Options.MinSamples && ChangeRatio < Options.Epsilon) {
- tlog << "Samples are stable!\n";
break;
}
Iterations *= Options.ScalingFactor;
}
- tlog << "Best Guess: " << BestGuess << '\n';
- tlog << "Samples: " << Samples << '\n';
- return BestGuess;
+ Result.Cycles = BestGuess;
+ Result.Samples = Samples;
+ Result.TotalIterations = TotalIterations;
+ return Result;
};
} // namespace libc_gpu_benchmarks
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index c9e4014808dc0..ccbbe3629dbda 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -4,10 +4,8 @@
#include "benchmarks/gpu/timing/timing.h"
#include "benchmarks/gpu/BenchmarkLogger.h"
-#include "src/__support/CPP/string.h"
-#include "src/__support/CPP/string_view.h"
-#include "src/__support/OSUtil/io.h"
+#include <stddef.h>
#include <stdint.h>
namespace LIBC_NAMESPACE {
@@ -61,53 +59,14 @@ class RuntimeEstimationProgression {
}
};
-template <typename F, typename... Args>
-uint64_t benchmark(const BenchmarkOptions &Options, F f, Args... args) {
- RuntimeEstimationProgression REP;
- size_t Iterations = Options.InitialIterations;
- if (Iterations < (uint32_t)1) {
- Iterations = 1;
- }
+struct BenchmarkResult {
+ uint64_t Cycles = 0;
size_t Samples = 0;
- uint64_t BestGuess = 0;
- uint64_t TotalCycles = 0;
-#if defined(LIBC_TARGET_ARCH_IS_NVPTX)
- // Nvidia cannot perform LTO, so we need to perform
- // 1 call to "warm up" the function before microbenchmarking
- uint64_t result = latency(f, args...);
- tlog << "Running warm-up iteration: " << result << '\n';
-#endif
- for (;;) {
- uint64_t SampleCycles = 0;
- for (uint32_t i = 0; i < Iterations; i++) {
- uint64_t result = latency(f, args...);
- SampleCycles += result;
- tlog << result << '\n';
- }
-
- Samples++;
- TotalCycles += SampleCycles;
- const double ChangeRatio =
- REP.ComputeImprovement({Iterations, SampleCycles});
- BestGuess = REP.CurrentEstimation;
-
- if (Samples >= Options.MaxSamples || Iterations >= Options.MaxIterations) {
- break;
- } else if (Samples >= Options.MinSamples && ChangeRatio < Options.Epsilon) {
- tlog << "Samples are stable!\n";
- break;
- }
-
- Iterations *= Options.ScalingFactor;
- }
- tlog << "Best Guess: " << BestGuess << '\n';
- tlog << "Samples: " << Samples << '\n';
- tlog << "\n";
- return BestGuess;
+ size_t TotalIterations = 0;
};
-uint64_t benchmark_wrapper(const BenchmarkOptions &Options,
- uint64_t (*WrapperFunc)());
+BenchmarkResult benchmark(const BenchmarkOptions &Options,
+ uint64_t (*WrapperFunc)());
class Benchmark {
Benchmark *Next = nullptr;
@@ -130,26 +89,6 @@ class Benchmark {
static Benchmark *End;
};
-template <typename F> class FunctionBenchmark : public Benchmark {
- F Func;
- const char *Name;
-
-public:
- FunctionBenchmark(F Func, char const *Name) : Func(Func), Name(Name) {
- addBenchmark(this);
- }
-
-private:
- void Run() override {
- BenchmarkOptions Options;
- auto latency = benchmark(Options, Func);
- tlog << "FnName: " << Name << '\n';
- tlog << "FnBenchmark: " << latency << '\n';
- tlog << "\n";
- }
- const char *getName() const override { return Name; }
-};
-
class WrapperBenchmark : public Benchmark {
using BenchmarkWrapperFunction = uint64_t (*)();
BenchmarkWrapperFunction Func;
@@ -163,25 +102,20 @@ class WrapperBenchmark : public Benchmark {
private:
void Run() override {
- tlog << "Running wrapper: " << Name << '\n';
BenchmarkOptions Options;
- auto latency = benchmark_wrapper(Options, Func);
- tlog << "FnName: " << Name << '\n';
- tlog << "FnBenchmark: " << latency << '\n';
- tlog << "\n";
+ auto result = benchmark(Options, Func);
+ constexpr auto GREEN = "\033[32m";
+ constexpr auto RESET = "\033[0m";
+ blog << GREEN << "[ RUN ] " << RESET << Name << '\n';
+ blog << GREEN << "[ OK ] " << RESET << Name << ": " << result.Cycles
+ << " cycles, " << result.TotalIterations << " iterations\n";
}
const char *getName() const override { return Name; }
};
-
} // namespace libc_gpu_benchmarks
-
} // namespace LIBC_NAMESPACE
-#define BENCHMARK_FN(SuiteName, TestName, Func) \
- LIBC_NAMESPACE::libc_gpu_benchmarks::FunctionBenchmark \
- SuiteName##_##TestName##_Instance(Func, #SuiteName "." #TestName);
-
-#define BENCHMARK_WRAPPER(SuiteName, TestName, Func) \
+#define BENCHMARK(SuiteName, TestName, Func) \
LIBC_NAMESPACE::libc_gpu_benchmarks::WrapperBenchmark \
SuiteName##_##TestName##_Instance(Func, #SuiteName "." #TestName);
diff --git a/libc/benchmarks/gpu/dummy.cpp b/libc/benchmarks/gpu/dummy.cpp
deleted file mode 100644
index fa9b220f33b23..0000000000000
--- a/libc/benchmarks/gpu/dummy.cpp
+++ /dev/null
@@ -1,21 +0,0 @@
-#include "LibcGpuBenchmark.h"
-#include "timing/timing.h"
-
-int add_test(int x) { return x + 1; }
-
-__attribute__((noinline)) [[gnu::noinline]] int function_call_overhead(int x) {
- asm volatile("");
- return x;
-}
-
-uint64_t DummyWrapperBenchmark() {
- int x = 10;
- return LIBC_NAMESPACE::latency(add_test, x);
-}
-BENCHMARK_WRAPPER(Dummy, DummyWrapperBenchmark, DummyWrapperBenchmark);
-
-uint64_t DummyFunctionCallOverhead() {
- int x = 10;
- return LIBC_NAMESPACE::latency(function_call_overhead, x);
-}
-BENCHMARK_WRAPPER(Dummy, DummyFunctionCallOverhead, DummyFunctionCallOverhead);
diff --git a/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp b/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp
index 071675bb887b6..8d9c958bb7ed4 100644
--- a/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp
+++ b/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp
@@ -2,23 +2,21 @@
#include "src/ctype/isalnum.h"
-// BENCHMARK_SINGLE_INPUT_OUTPUT(LlvmLibcIsAlNumGpuBenchmark,
-// IsAlnumSingleInputOutput,
-// LIBC_NAMESPACE::isalnum, 'c');
-
-// void BM_IsAlnumBasic() { bool isAlpha = LIBC_NAMESPACE::isalnum('c'); }
-// BENCHMARK_FN(LlvmLibcIsAlNumGpuBenchmark, IsAlnumC, BM_IsAlnumBasic);
-
-uint64_t BM_IsAlnumWrapper() {
+uint64_t BM_IsAlnum() {
char x = 'c';
return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalnum, x);
}
-BENCHMARK_WRAPPER(LlvmLibcIsAlNumGpuBenchmark, IsAlnumWrapper,
- BM_IsAlnumWrapper);
+BENCHMARK(LlvmLibcIsAlNumGpuBenchmark, IsAlnumWrapper, BM_IsAlnum);
+
+[[gnu::noinline]] static uint64_t single_input_function(int x) {
+ asm volatile("" ::"r"(x)); // prevent the compiler from optimizing out x
+ return x;
+}
uint64_t BM_IsAlnumWithOverhead() {
char x = 'c';
- return LIBC_NAMESPACE::function_call_latency(LIBC_NAMESPACE::isalnum, x);
+ return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalnum, x) -
+ LIBC_NAMESPACE::latency(single_input_function, 0);
}
-BENCHMARK_WRAPPER(LlvmLibcIsAlNumGpuBenchmark, IsAlnumWithOverhead,
- BM_IsAlnumWithOverhead);
+BENCHMARK(LlvmLibcIsAlNumGpuBenchmark, IsAlnumWithOverhead,
+ BM_IsAlnumWithOverhead);
diff --git a/libc/benchmarks/gpu/src/ctype/isalpha_benchmark.cpp b/libc/benchmarks/gpu/src/ctype/isalpha_benchmark.cpp
index e432b1cf72c5f..2038eb89bc77b 100644
--- a/libc/benchmarks/gpu/src/ctype/isalpha_benchmark.cpp
+++ b/libc/benchmarks/gpu/src/ctype/isalpha_benchmark.cpp
@@ -6,4 +6,4 @@ uint64_t BM_IsAlpha() {
char x = 'c';
return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalpha, x);
}
-BENCHMARK_WRAPPER(LlvmLibcIsAlphaGpuBenchmark, IsAlpha, BM_IsAlpha);
+BENCHMARK(LlvmLibcIsAlphaGpuBenchmark, IsAlpha, BM_IsAlpha);
diff --git a/libc/benchmarks/gpu/timing/nvptx/timing.h b/libc/benchmarks/gpu/timing/nvptx/timing.h
index 39c23d596b7f3..008432e6aa1d2 100644
--- a/libc/benchmarks/gpu/timing/nvptx/timing.h
+++ b/libc/benchmarks/gpu/timing/nvptx/timing.h
@@ -79,38 +79,6 @@ template <typename F, typename T>
return stop - start;
}
-[[gnu::noinline]] static uint64_t single_input_function(int x) {
- asm volatile("" :: "r"(x)); // prevent the compiler from optimizing out x
- return x;
-}
-
-template <typename F, typename T>
-static LIBC_INLINE uint64_t function_call_latency(F f, T t) {
- auto function_call_overhead = latency(single_input_function, 0);
- return latency(f, t) - function_call_overhead;
-}
-
-static LIBC_INLINE uint64_t latency(void (*f)()) {
- // Get the current timestamp from the clock.
- gpu::sync_threads();
- uint64_t start = gpu::processor_clock();
-
- // This forces the compiler to load the input argument and run the clock cycle
- // counter before the profiling region.
- asm volatile("" ::"llr"(start));
-
- // Run the function under test and return its value.
- f();
-
- // Obtain the current timestamp after running the calculation and force
- // ordering.
- uint64_t stop = gpu::processor_clock();
- gpu::sync_threads();
-
- // Return the time elapsed.
- return stop - start;
-}
-
template <typename F, typename T1, typename T2>
static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {
volatile T1 storage = t1;
@@ -135,41 +103,6 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {
return stop - start;
}
-
} // namespace LIBC_NAMESPACE
-/**
- * LatencyP must be a pointer to a uint64_t holding the result variable
- */
-#define SINGLE_INPUT_OUTPUT_LATENCY(Func, t, LatencyP) \
- do { \
- tlog << "Latency: " << ((long)(*LatencyP)) << '\n'; \
- *LatencyP = 200; \
- volatile auto storage = t; \
- auto arg = storage; \
- asm volatile("" ::"r"(arg), "r"(LatencyP)); \
- \
- LIBC_NAMESPACE::gpu::sync_threads(); \
- uint64_t start = LIBC_NAMESPACE::gpu::processor_clock(); \
- \
- asm volatile("" ::"r"(arg), "llr"(start)); \
- auto result = Func(arg); \
- asm volatile("" ::"r"(LatencyP)); \
- *LatencyP = 312; \
- asm volatile("" ::"r"(LatencyP)); \
- asm volatile("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :); \
- asm volatile("" ::"r"(LatencyP)); \
- *LatencyP = 499; \
- \
- uint64_t stop = gpu::processor_clock(); \
- gpu::sync_threads(); \
- volatile auto output = result; \
- \
- tlog << "Start: " << start << '\n'; \
- tlog << "Stop: " << stop << '\n'; \
- tlog << "Diff: " << (stop - start) << '\n'; \
- asm volatile("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(LatencyP) :); \
- *LatencyP = stop - start; \
- } while (0)
-
#endif // LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
diff --git a/libc/benchmarks/gpu/timing/timing.h b/libc/benchmarks/gpu/timing/timing.h
index 45201e56964e6..f37381fdf65db 100644
--- a/libc/benchmarks/gpu/timing/timing.h
+++ b/libc/benchmarks/gpu/timing/timing.h
@@ -19,4 +19,5 @@
#error "unsupported platform"
#endif
-#endif // LLVM_LIBC_UTILS_GPU_TIMING_H
\ No newline at end of file
+#endif // LLVM_LIBC_UTILS_GPU_TIMING_H
+
More information about the libc-commits
mailing list