[libc-commits] [libc] [libc] NVPTX Profiling Draft (PR #92009)
via libc-commits
libc-commits at lists.llvm.org
Sat Jun 15 11:30:21 PDT 2024
https://github.com/jameshu15869 updated https://github.com/llvm/llvm-project/pull/92009
>From b6b47fb3b6d7560d667efb0841710740be3db714 Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Sat, 11 May 2024 17:19:54 -0400
Subject: [PATCH 01/16] initial nvptx microbenchmarking infrastructure
---
libc/benchmarks/CMakeLists.txt | 416 +++++++++---------
libc/benchmarks/gpu/BenchmarkLogger.cpp | 89 ++++
libc/benchmarks/gpu/BenchmarkLogger.h | 27 ++
libc/benchmarks/gpu/CMakeLists.txt | 183 ++++++++
libc/benchmarks/gpu/LibcGpuBenchmark.cpp | 70 +++
libc/benchmarks/gpu/LibcGpuBenchmark.h | 122 +++++
libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp | 6 +
libc/benchmarks/gpu/src/CMakeLists.txt | 2 +
libc/benchmarks/gpu/src/ctype/CMakeLists.txt | 21 +
.../gpu/src/ctype/isalnum_benchmark.cpp | 22 +
.../gpu/src/ctype/isalpha_benchmark.cpp | 9 +
libc/benchmarks/gpu/src/math/CMakeLists.txt | 0
libc/benchmarks/gpu/timing/CMakeLists.txt | 12 +
.../gpu/timing/nvptx/CMakeLists.txt | 7 +
libc/benchmarks/gpu/timing/nvptx/timing.h | 108 +++++
libc/benchmarks/gpu/timing/timing.h | 22 +
16 files changed, 911 insertions(+), 205 deletions(-)
create mode 100644 libc/benchmarks/gpu/BenchmarkLogger.cpp
create mode 100644 libc/benchmarks/gpu/BenchmarkLogger.h
create mode 100644 libc/benchmarks/gpu/CMakeLists.txt
create mode 100644 libc/benchmarks/gpu/LibcGpuBenchmark.cpp
create mode 100644 libc/benchmarks/gpu/LibcGpuBenchmark.h
create mode 100644 libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp
create mode 100644 libc/benchmarks/gpu/src/CMakeLists.txt
create mode 100644 libc/benchmarks/gpu/src/ctype/CMakeLists.txt
create mode 100644 libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp
create mode 100644 libc/benchmarks/gpu/src/ctype/isalpha_benchmark.cpp
create mode 100644 libc/benchmarks/gpu/src/math/CMakeLists.txt
create mode 100644 libc/benchmarks/gpu/timing/CMakeLists.txt
create mode 100644 libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt
create mode 100644 libc/benchmarks/gpu/timing/nvptx/timing.h
create mode 100644 libc/benchmarks/gpu/timing/timing.h
diff --git a/libc/benchmarks/CMakeLists.txt b/libc/benchmarks/CMakeLists.txt
index 4978da65850cc..a802e653a091e 100644
--- a/libc/benchmarks/CMakeLists.txt
+++ b/libc/benchmarks/CMakeLists.txt
@@ -1,205 +1,211 @@
-find_package(Threads)
-
-set(LLVM_LINK_COMPONENTS
- Support
- TargetParser
- )
-
-#==============================================================================
-# Add Unit Testing Support
-#==============================================================================
-
-function(add_libc_benchmark_unittest target_name)
- if(NOT LLVM_INCLUDE_TESTS)
- return()
- endif()
-
- cmake_parse_arguments(
- "LIBC_BENCHMARKS_UNITTEST"
- "" # No optional arguments
- "SUITE" # Single value arguments
- "SRCS;DEPENDS" # Multi-value arguments
- ${ARGN}
- )
-
- add_executable(${target_name}
- EXCLUDE_FROM_ALL
- ${LIBC_BENCHMARKS_UNITTEST_SRCS}
- )
- target_link_libraries(${target_name}
- PRIVATE
- llvm_gtest_main
- llvm_gtest
- ${LIBC_BENCHMARKS_UNITTEST_DEPENDS}
- )
- llvm_update_compile_flags(${target_name})
-
- add_custom_command(
- TARGET ${target_name}
- POST_BUILD
- COMMAND $<TARGET_FILE:${target_name}>
- )
- add_dependencies(libc-benchmark-util-tests ${target_name})
-endfunction()
-
-#==============================================================================
-# Build Google Benchmark for libc
-#==============================================================================
-
-include(ExternalProject)
-ExternalProject_Add(google-benchmark-libc
- EXCLUDE_FROM_ALL ON
- PREFIX google-benchmark-libc
- SOURCE_DIR ${LLVM_THIRD_PARTY_DIR}/benchmark
- INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/google-benchmark-libc
- CMAKE_CACHE_ARGS
- -DBENCHMARK_ENABLE_EXCEPTIONS:BOOL=OFF
- -DBENCHMARK_ENABLE_LTO:BOOL=OFF
- -DBENCHMARK_ENABLE_TESTING:BOOL=OFF
- -DBENCHMARK_ENABLE_WERROR:BOOL=${LLVM_ENABLE_WERROR}
- -DBENCHMARK_FORCE_WERROR:BOOL=OFF
- -DBENCHMARK_USE_LIBCXX:BOOL=OFF
- -DCMAKE_BUILD_TYPE:STRING=Release
-
- -DCMAKE_SYSTEM_NAME:STRING=${CMAKE_SYSTEM_NAME}
- -DCMAKE_SYSTEM_PROCESSOR:STRING=${CMAKE_SYSTEM_PROCESSOR}
- -DCMAKE_C_COMPILER:STRING=${CMAKE_C_COMPILER}
- -DCMAKE_CXX_COMPILER:STRING=${CMAKE_CXX_COMPILER}
- -DCMAKE_CXX_FLAGS:STRING=${CMAKE_CXX_FLAGS}
- -DCMAKE_FIND_ROOT_PATH:STRING=${CMAKE_FIND_ROOT_PATH}
-
- -DBUILD_SHARED_LIBS:BOOL=OFF
- -DCMAKE_EXE_LINKER_FLAGS:STRING=-static
-
- -DCMAKE_CXX_STANDARD:STRING=14
- -DCMAKE_INSTALL_PREFIX:PATH=<INSTALL_DIR>
- )
-
-add_custom_target(libc-benchmark-util-tests)
-
-# libc-benchmark
-add_library(libc-benchmark
- STATIC
- EXCLUDE_FROM_ALL
- LibcBenchmark.cpp
- LibcBenchmark.h
-)
-
-target_include_directories(libc-benchmark
- PUBLIC ${LLVM_INCLUDE_DIR} ${LLVM_MAIN_INCLUDE_DIR}
-)
-target_link_libraries(libc-benchmark
- PUBLIC
- benchmark::benchmark
- LLVMSupport
- LLVMTargetParser
- Threads::Threads
-)
-add_dependencies(libc-benchmark google-benchmark-libc)
-llvm_update_compile_flags(libc-benchmark)
-
-add_libc_benchmark_unittest(libc-benchmark-test
- SRCS LibcBenchmarkTest.cpp
- DEPENDS libc-benchmark
-)
-
-# libc-memory-benchmark
-add_library(libc-memory-benchmark
- STATIC
- EXCLUDE_FROM_ALL
- LibcMemoryBenchmark.cpp
- LibcMemoryBenchmark.h
- LibcFunctionPrototypes.h
- MemorySizeDistributions.cpp
- MemorySizeDistributions.h
-)
-target_include_directories(libc-memory-benchmark
- PUBLIC
- ${CMAKE_CURRENT_SOURCE_DIR}
-)
-target_link_libraries(libc-memory-benchmark
- PUBLIC
- libc-benchmark
-)
-llvm_update_compile_flags(libc-memory-benchmark)
-
-add_libc_benchmark_unittest(libc-memory-benchmark-test
- SRCS LibcMemoryBenchmarkTest.cpp
- DEPENDS libc-memory-benchmark
-)
-
-# json
-add_library(json
- STATIC
- EXCLUDE_FROM_ALL
- JSON.cpp
- JSON.h
-)
-target_link_libraries(json PUBLIC libc-memory-benchmark)
-llvm_update_compile_flags(json)
-
-add_libc_benchmark_unittest(json-test
- SRCS JSONTest.cpp
- DEPENDS json
-)
-
-#==============================================================================
-# Benchmarking tool
-#==============================================================================
-
-# Benchmark all implementations that can run on the target CPU.
-function(add_libc_multi_impl_benchmark name)
- get_property(fq_implementations GLOBAL PROPERTY ${name}_implementations)
- foreach(fq_config_name IN LISTS fq_implementations)
- get_target_property(required_cpu_features ${fq_config_name} REQUIRE_CPU_FEATURES)
- cpu_supports(can_run "${required_cpu_features}")
- if(can_run)
- set(benchmark_name ${fq_config_name}_benchmark)
- add_executable(${benchmark_name}
- EXCLUDE_FROM_ALL
- LibcMemoryBenchmarkMain.cpp
- )
- get_target_property(entrypoint_object_file ${fq_config_name} "OBJECT_FILE_RAW")
- target_link_libraries(${benchmark_name} PUBLIC json ${entrypoint_object_file})
- string(TOUPPER ${name} name_upper)
- target_compile_definitions(${benchmark_name} PRIVATE "-DLIBC_BENCHMARK_FUNCTION_${name_upper}=LIBC_NAMESPACE::${name}" "-DLIBC_BENCHMARK_FUNCTION_NAME=\"${fq_config_name}\"")
- llvm_update_compile_flags(${benchmark_name})
- else()
- message(STATUS "Skipping benchmark for '${fq_config_name}' insufficient host cpu features '${required_cpu_features}'")
- endif()
- endforeach()
-endfunction()
-
-add_libc_multi_impl_benchmark(bcmp)
-add_libc_multi_impl_benchmark(bzero)
-add_libc_multi_impl_benchmark(memcmp)
-add_libc_multi_impl_benchmark(memcpy)
-add_libc_multi_impl_benchmark(memmove)
-add_libc_multi_impl_benchmark(memset)
-
-#==============================================================================
-# Google Benchmarking tool
-#==============================================================================
-
-# This target uses the Google Benchmark facility to report throughput for llvm
-# libc memory functions compiled for the host machine. This is useful to
-# continuously monitor the performance of the memory functions.
-add_executable(libc.benchmarks.memory_functions.opt_host
- EXCLUDE_FROM_ALL
- LibcMemoryGoogleBenchmarkMain.cpp
- LibcDefaultImplementations.cpp
-)
-target_link_libraries(libc.benchmarks.memory_functions.opt_host
- PRIVATE
- libc-memory-benchmark
- libc.src.string.memcmp_opt_host.__internal__
- libc.src.string.bcmp_opt_host.__internal__
- libc.src.string.memcpy_opt_host.__internal__
- libc.src.string.memset_opt_host.__internal__
- libc.src.string.bzero_opt_host.__internal__
- libc.src.string.memmove_opt_host.__internal__
- benchmark_main
-)
-llvm_update_compile_flags(libc.benchmarks.memory_functions.opt_host)
-
-add_subdirectory(automemcpy)
+if(NOT LIBC_TARGET_OS_IS_GPU)
+ find_package(Threads)
+
+ set(LLVM_LINK_COMPONENTS
+ Support
+ TargetParser
+ )
+
+ #==============================================================================
+ # Add Unit Testing Support
+ #==============================================================================
+
+ # Defines a gtest-based unit test executable for the benchmark utilities.
+ #
+ # Arguments:
+ #   target_name - name of the executable target to create.
+ #   SRCS        - source files of the test.
+ #   DEPENDS     - libraries linked in addition to llvm_gtest_main/llvm_gtest.
+ #   SUITE       - accepted by the parser but not used by this function.
+ #
+ # The function does nothing unless LLVM_INCLUDE_TESTS is set. The executable
+ # is excluded from the default build, runs itself as a POST_BUILD step, and
+ # is attached to the libc-benchmark-util-tests target.
+ function(add_libc_benchmark_unittest target_name)
+ if(NOT LLVM_INCLUDE_TESTS)
+ return()
+ endif()
+
+ cmake_parse_arguments(
+ "LIBC_BENCHMARKS_UNITTEST"
+ "" # No optional arguments
+ "SUITE" # Single value arguments
+ "SRCS;DEPENDS" # Multi-value arguments
+ ${ARGN}
+ )
+
+ add_executable(${target_name}
+ EXCLUDE_FROM_ALL
+ ${LIBC_BENCHMARKS_UNITTEST_SRCS}
+ )
+ target_link_libraries(${target_name}
+ PRIVATE
+ llvm_gtest_main
+ llvm_gtest
+ ${LIBC_BENCHMARKS_UNITTEST_DEPENDS}
+ )
+ llvm_update_compile_flags(${target_name})
+
+ # Run the freshly built test binary as part of building the target.
+ add_custom_command(
+ TARGET ${target_name}
+ POST_BUILD
+ COMMAND $<TARGET_FILE:${target_name}>
+ )
+ add_dependencies(libc-benchmark-util-tests ${target_name})
+ endfunction()
+
+ #==============================================================================
+ # Build Google Benchmark for libc
+ #==============================================================================
+
+ include(ExternalProject)
+ ExternalProject_Add(google-benchmark-libc
+ EXCLUDE_FROM_ALL ON
+ PREFIX google-benchmark-libc
+ SOURCE_DIR ${LLVM_THIRD_PARTY_DIR}/benchmark
+ INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/google-benchmark-libc
+ CMAKE_CACHE_ARGS
+ -DBENCHMARK_ENABLE_EXCEPTIONS:BOOL=OFF
+ -DBENCHMARK_ENABLE_LTO:BOOL=OFF
+ -DBENCHMARK_ENABLE_TESTING:BOOL=OFF
+ -DBENCHMARK_ENABLE_WERROR:BOOL=${LLVM_ENABLE_WERROR}
+ -DBENCHMARK_FORCE_WERROR:BOOL=OFF
+ -DBENCHMARK_USE_LIBCXX:BOOL=OFF
+ -DCMAKE_BUILD_TYPE:STRING=Release
+
+ -DCMAKE_SYSTEM_NAME:STRING=${CMAKE_SYSTEM_NAME}
+ -DCMAKE_SYSTEM_PROCESSOR:STRING=${CMAKE_SYSTEM_PROCESSOR}
+ -DCMAKE_C_COMPILER:STRING=${CMAKE_C_COMPILER}
+ -DCMAKE_CXX_COMPILER:STRING=${CMAKE_CXX_COMPILER}
+ -DCMAKE_CXX_FLAGS:STRING=${CMAKE_CXX_FLAGS}
+ -DCMAKE_FIND_ROOT_PATH:STRING=${CMAKE_FIND_ROOT_PATH}
+
+ -DBUILD_SHARED_LIBS:BOOL=OFF
+ -DCMAKE_EXE_LINKER_FLAGS:STRING=-static
+
+ -DCMAKE_CXX_STANDARD:STRING=14
+ -DCMAKE_INSTALL_PREFIX:PATH=<INSTALL_DIR>
+ )
+
+ add_custom_target(libc-benchmark-util-tests)
+
+ # libc-benchmark
+ add_library(libc-benchmark
+ STATIC
+ EXCLUDE_FROM_ALL
+ LibcBenchmark.cpp
+ LibcBenchmark.h
+ )
+
+ target_include_directories(libc-benchmark
+ PUBLIC ${LLVM_INCLUDE_DIR} ${LLVM_MAIN_INCLUDE_DIR}
+ )
+ target_link_libraries(libc-benchmark
+ PUBLIC
+ benchmark::benchmark
+ LLVMSupport
+ LLVMTargetParser
+ Threads::Threads
+ )
+ add_dependencies(libc-benchmark google-benchmark-libc)
+ llvm_update_compile_flags(libc-benchmark)
+
+ add_libc_benchmark_unittest(libc-benchmark-test
+ SRCS LibcBenchmarkTest.cpp
+ DEPENDS libc-benchmark
+ )
+
+ # libc-memory-benchmark
+ add_library(libc-memory-benchmark
+ STATIC
+ EXCLUDE_FROM_ALL
+ LibcMemoryBenchmark.cpp
+ LibcMemoryBenchmark.h
+ LibcFunctionPrototypes.h
+ MemorySizeDistributions.cpp
+ MemorySizeDistributions.h
+ )
+ target_include_directories(libc-memory-benchmark
+ PUBLIC
+ ${CMAKE_CURRENT_SOURCE_DIR}
+ )
+ target_link_libraries(libc-memory-benchmark
+ PUBLIC
+ libc-benchmark
+ )
+ llvm_update_compile_flags(libc-memory-benchmark)
+
+ add_libc_benchmark_unittest(libc-memory-benchmark-test
+ SRCS LibcMemoryBenchmarkTest.cpp
+ DEPENDS libc-memory-benchmark
+ )
+
+ # json
+ add_library(json
+ STATIC
+ EXCLUDE_FROM_ALL
+ JSON.cpp
+ JSON.h
+ )
+ target_link_libraries(json PUBLIC libc-memory-benchmark)
+ llvm_update_compile_flags(json)
+
+ add_libc_benchmark_unittest(json-test
+ SRCS JSONTest.cpp
+ DEPENDS json
+ )
+
+ #==============================================================================
+ # Benchmarking tool
+ #==============================================================================
+
+ # Benchmark all implementations that can run on the target CPU.
+ #
+ # `name` is the function being benchmarked (e.g. memcpy). The list of its
+ # implementations is read from the global property `<name>_implementations`.
+ # For every implementation whose REQUIRE_CPU_FEATURES are reported as
+ # supported by cpu_supports(), an executable `<implementation>_benchmark` is
+ # created from LibcMemoryBenchmarkMain.cpp and linked against `json` and the
+ # implementation's OBJECT_FILE_RAW. Unsupported implementations are skipped
+ # with a STATUS message.
+ function(add_libc_multi_impl_benchmark name)
+ get_property(fq_implementations GLOBAL PROPERTY ${name}_implementations)
+ foreach(fq_config_name IN LISTS fq_implementations)
+ get_target_property(required_cpu_features ${fq_config_name} REQUIRE_CPU_FEATURES)
+ cpu_supports(can_run "${required_cpu_features}")
+ if(can_run)
+ set(benchmark_name ${fq_config_name}_benchmark)
+ add_executable(${benchmark_name}
+ EXCLUDE_FROM_ALL
+ LibcMemoryBenchmarkMain.cpp
+ )
+ get_target_property(entrypoint_object_file ${fq_config_name} "OBJECT_FILE_RAW")
+ target_link_libraries(${benchmark_name} PUBLIC json ${entrypoint_object_file})
+ string(TOUPPER ${name} name_upper)
+ # Tell the shared main() which function it benchmarks and under which name
+ # the result is reported.
+ target_compile_definitions(${benchmark_name} PRIVATE "-DLIBC_BENCHMARK_FUNCTION_${name_upper}=LIBC_NAMESPACE::${name}" "-DLIBC_BENCHMARK_FUNCTION_NAME=\"${fq_config_name}\"")
+ llvm_update_compile_flags(${benchmark_name})
+ else()
+ message(STATUS "Skipping benchmark for '${fq_config_name}' insufficient host cpu features '${required_cpu_features}'")
+ endif()
+ endforeach()
+ endfunction()
+
+ add_libc_multi_impl_benchmark(bcmp)
+ add_libc_multi_impl_benchmark(bzero)
+ add_libc_multi_impl_benchmark(memcmp)
+ add_libc_multi_impl_benchmark(memcpy)
+ add_libc_multi_impl_benchmark(memmove)
+ add_libc_multi_impl_benchmark(memset)
+
+ #==============================================================================
+ # Google Benchmarking tool
+ #==============================================================================
+
+ # This target uses the Google Benchmark facility to report throughput for llvm
+ # libc memory functions compiled for the host machine. This is useful to
+ # continuously monitor the performance of the memory functions.
+ add_executable(libc.benchmarks.memory_functions.opt_host
+ EXCLUDE_FROM_ALL
+ LibcMemoryGoogleBenchmarkMain.cpp
+ LibcDefaultImplementations.cpp
+ )
+ target_link_libraries(libc.benchmarks.memory_functions.opt_host
+ PRIVATE
+ libc-memory-benchmark
+ libc.src.string.memcmp_opt_host.__internal__
+ libc.src.string.bcmp_opt_host.__internal__
+ libc.src.string.memcpy_opt_host.__internal__
+ libc.src.string.memset_opt_host.__internal__
+ libc.src.string.bzero_opt_host.__internal__
+ libc.src.string.memmove_opt_host.__internal__
+ benchmark_main
+ )
+ llvm_update_compile_flags(libc.benchmarks.memory_functions.opt_host)
+
+ add_subdirectory(automemcpy)
+endif()
+
+# GPU targets only build the benchmark infrastructure that lives in gpu/.
+if(LIBC_TARGET_OS_IS_GPU)
+ add_subdirectory(gpu)
+endif()
diff --git a/libc/benchmarks/gpu/BenchmarkLogger.cpp b/libc/benchmarks/gpu/BenchmarkLogger.cpp
new file mode 100644
index 0000000000000..94a0d897c9585
--- /dev/null
+++ b/libc/benchmarks/gpu/BenchmarkLogger.cpp
@@ -0,0 +1,89 @@
+#include "benchmarks/gpu/BenchmarkLogger.h"
+#include "src/__support/CPP/string.h"
+#include "src/__support/CPP/string_view.h"
+#include "src/__support/OSUtil/io.h" // write_to_stderr
+#include "src/__support/big_int.h" // is_big_int
+#include "src/__support/macros/properties/types.h" // LIBC_TYPES_HAS_INT128
+#include "src/__support/uint128.h"
+
+#include <stdint.h>
+
+namespace LIBC_NAMESPACE {
+namespace libc_gpu_benchmarks {
+
+// cpp::string_view specialization. This is the only overload that performs
+// output; every other overload in this file converts its argument and
+// forwards here. The text is written with write_to_stderr.
+template <>
+BenchmarkLogger &BenchmarkLogger::operator<< <cpp::string_view>(cpp::string_view str) {
+ LIBC_NAMESPACE::write_to_stderr(str);
+ return *this;
+}
+
+// cpp::string specialization: forwards the string's contents as a view.
+template <> BenchmarkLogger &BenchmarkLogger::operator<< <cpp::string>(cpp::string str) {
+ return *this << static_cast<cpp::string_view>(str);
+}
+
+// const char* specialization: wraps the C string in a view and forwards it.
+template <> BenchmarkLogger &BenchmarkLogger::operator<< <const char *>(const char *str) {
+ return *this << cpp::string_view(str);
+}
+
+// char* specialization: same as the const char* overload.
+template <> BenchmarkLogger &BenchmarkLogger::operator<< <char *>(char *str) {
+ return *this << cpp::string_view(str);
+}
+
+// char specialization: logs the character itself as a one-element view.
+template <> BenchmarkLogger &BenchmarkLogger::operator<<(char ch) {
+ return *this << cpp::string_view(&ch, 1);
+}
+
+// bool specialization: logs the words "true" or "false".
+template <> BenchmarkLogger &BenchmarkLogger::operator<<(bool cond) {
+ return *this << (cond ? "true" : "false");
+}
+
+// void * specialization: logs the pointer value in hexadecimal.
+template <> BenchmarkLogger &BenchmarkLogger::operator<<(void *addr) {
+  // Use the hexadecimal formatter (which supplies the prefix itself) so that
+  // the digits match the "0x" notation; formatting the address with
+  // cpp::to_string would put decimal digits behind a hex prefix.
+  const IntegerToString<uintptr_t, radix::Hex::WithPrefix> buffer(
+      reinterpret_cast<uintptr_t>(addr));
+  return *this << buffer.view();
+}
+
+// Generic integer overload; it is the body behind the explicit
+// instantiations in this file. Big-int types and unsigned integers wider than
+// 64 bits are rendered with the prefixed hexadecimal formatter, everything
+// else goes through cpp::to_string.
+template <typename T> BenchmarkLogger &BenchmarkLogger::operator<<(T t) {
+  constexpr bool IsWideUnsigned = cpp::is_integral_v<T> &&
+                                  cpp::is_unsigned_v<T> &&
+                                  (sizeof(T) > sizeof(uint64_t));
+  if constexpr (!is_big_int_v<T> && !IsWideUnsigned) {
+    return *this << cpp::to_string(t);
+  } else {
+    static_assert(sizeof(T) % 8 == 0, "Unsupported size of UInt");
+    const IntegerToString<T, radix::Hex::WithPrefix> hex(t);
+    return *this << hex.view();
+  }
+}
+
+// Explicit instantiations for the integral types.
+// char is not listed because it has a dedicated specialization that logs it as a character.
+template BenchmarkLogger &BenchmarkLogger::operator<< <short>(short);
+template BenchmarkLogger &BenchmarkLogger::operator<< <int>(int);
+template BenchmarkLogger &BenchmarkLogger::operator<< <long>(long);
+template BenchmarkLogger &BenchmarkLogger::operator<< <long long>(long long);
+template BenchmarkLogger &BenchmarkLogger::operator<< <unsigned char>(unsigned char);
+template BenchmarkLogger &BenchmarkLogger::operator<< <unsigned short>(unsigned short);
+template BenchmarkLogger &BenchmarkLogger::operator<< <unsigned int>(unsigned int);
+template BenchmarkLogger &BenchmarkLogger::operator<< <unsigned long>(unsigned long);
+template BenchmarkLogger &
+ BenchmarkLogger::operator<< <unsigned long long>(unsigned long long);
+
+#ifdef LIBC_TYPES_HAS_INT128
+template BenchmarkLogger &BenchmarkLogger::operator<< <__uint128_t>(__uint128_t);
+#endif // LIBC_TYPES_HAS_INT128
+template BenchmarkLogger &BenchmarkLogger::operator<< <UInt<128>>(UInt<128>);
+template BenchmarkLogger &BenchmarkLogger::operator<< <UInt<192>>(UInt<192>);
+template BenchmarkLogger &BenchmarkLogger::operator<< <UInt<256>>(UInt<256>);
+template BenchmarkLogger &BenchmarkLogger::operator<< <UInt<320>>(UInt<320>);
+
+// TODO: Add floating point formatting once it's supported by StringStream.
+
+BenchmarkLogger blog;
+
+} // namespace libc_gpu_benchmarks
+} // namespace LIBC_NAMESPACE
diff --git a/libc/benchmarks/gpu/BenchmarkLogger.h b/libc/benchmarks/gpu/BenchmarkLogger.h
new file mode 100644
index 0000000000000..ed3cc97e59c6d
--- /dev/null
+++ b/libc/benchmarks/gpu/BenchmarkLogger.h
@@ -0,0 +1,27 @@
+//===-- Utilities to log to standard error during benchmarks ----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_BENCHMARKS_GPU_BENCHMARKLOGGER_H
+#define LLVM_LIBC_BENCHMARKS_GPU_BENCHMARKLOGGER_H
+
+namespace LIBC_NAMESPACE {
+namespace libc_gpu_benchmarks {
+
+// A class to log to standard error from GPU benchmarks.
+// Values are streamed with operator<<; the overloads are defined and
+// explicitly instantiated in BenchmarkLogger.cpp, so only the types handled
+// there can be logged.
+struct BenchmarkLogger {
+ constexpr BenchmarkLogger() = default;
+ template <typename T> BenchmarkLogger &operator<<(T);
+};
+
+// A global BenchmarkLogger instance to be used in benchmarks.
+extern BenchmarkLogger blog;
+
+} // namespace libc_gpu_benchmarks
+} // namespace LIBC_NAMESPACE
+
+#endif /* LLVM_LIBC_BENCHMARKS_GPU_BENCHMARKLOGGER_H */
diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
new file mode 100644
index 0000000000000..a18be27e33573
--- /dev/null
+++ b/libc/benchmarks/gpu/CMakeLists.txt
@@ -0,0 +1,183 @@
+add_subdirectory(timing)
+
+add_custom_target(gpu-benchmark)
+
+# Defines a GPU benchmark: a hermetic executable plus a custom target that
+# launches it.
+#
+# Usage:
+#   add_gpu_benchmark(
+#     <name>
+#     SUITE <target>            # required; suite target that depends on it
+#     SRCS <src>...             # required; benchmark sources
+#     HDRS <hdr>...             # headers added to the executable
+#     DEPENDS <target>...       # libc targets whose objects are linked in
+#     ARGS <arg>...             # arguments passed to the benchmark binary
+#     ENV <item>...             # placed at the front of the run command
+#     COMPILE_OPTIONS <opt>...  # extra compile options
+#     LINK_LIBRARIES <lib>...   # extra libraries; <lib>.hermetic is used
+#                               # instead when such a target exists
+#     LOADER_ARGS <arg>...      # arguments placed before the binary
+#   )
+#
+# Targets created (with <fq> being the fully qualified benchmark name):
+#   <fq>.__libc__   static archive of all collected dependency objects
+#   <fq>.__build__  the benchmark executable
+#   <fq>            custom target that runs the executable; added to SUITE
+#                   and to the umbrella `gpu-benchmark` target
+#
+# The benchmark is skipped when the startup object for LIBC_TARGET_OS is not
+# available or when some of its dependencies are missing.
+function(add_gpu_benchmark test_name)
+  # Resolve the fully qualified name first so that every diagnostic below can
+  # refer to it.
+  get_fq_target_name(${test_name} fq_target_name)
+
+  if(NOT TARGET libc.startup.${LIBC_TARGET_OS}.crt1)
+    message(VERBOSE "Skipping ${fq_target_name} as it is not available on ${LIBC_TARGET_OS}.")
+    return()
+  endif()
+
+  cmake_parse_arguments(
+    "GPU_BENCHMARK"
+    "" # No optional arguments
+    "SUITE" # Single value arguments
+    "SRCS;HDRS;DEPENDS;ARGS;ENV;COMPILE_OPTIONS;LINK_LIBRARIES;LOADER_ARGS" # Multi-value arguments
+    ${ARGN}
+  )
+
+  if(NOT GPU_BENCHMARK_SUITE)
+    message(FATAL_ERROR "SUITE not specified for ${fq_target_name}")
+  endif()
+  if(NOT GPU_BENCHMARK_SRCS)
+    message(FATAL_ERROR "The SRCS list for add_gpu_benchmark is missing.")
+  endif()
+
+  get_fq_target_name(${test_name}.libc fq_libc_target_name) # Stores the compiled libc + infrastructure archive to link in
+  get_fq_deps_list(fq_deps_list ${GPU_BENCHMARK_DEPENDS})
+  list(APPEND fq_deps_list
+    # Hermetic tests use the platform's startup object. So, their deps also
+    # have to be collected.
+    libc.startup.${LIBC_TARGET_OS}.crt1
+    # We always add the memory functions objects. This is because the
+    # compiler's codegen can emit calls to the C memory functions.
+    libc.src.string.bcmp
+    libc.src.string.bzero
+    libc.src.string.memcmp
+    libc.src.string.memcpy
+    libc.src.string.memmove
+    libc.src.string.memset
+    libc.src.__support.StringUtil.error_to_string
+  )
+
+  list(REMOVE_DUPLICATES fq_deps_list)
+
+  # TODO: Instead of gathering internal object files from entrypoints,
+  # collect the object files with public names of entrypoints.
+  get_object_files_for_test(
+    link_object_files skipped_entrypoints_list ${fq_deps_list})
+  if(skipped_entrypoints_list)
+    if(LIBC_CMAKE_VERBOSE_LOGGING)
+      message(STATUS "Skipping GPU benchmark ${fq_target_name} as it has missing deps: ${skipped_entrypoints_list}.")
+    endif()
+    return()
+  endif()
+  list(REMOVE_DUPLICATES link_object_files)
+
+  # Make a library of all deps
+  add_library(
+    ${fq_target_name}.__libc__
+    STATIC
+    EXCLUDE_FROM_ALL
+    ${link_object_files}
+  )
+  set_target_properties(${fq_target_name}.__libc__
+    PROPERTIES ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+  set_target_properties(${fq_target_name}.__libc__
+    PROPERTIES ARCHIVE_OUTPUT_NAME ${fq_target_name}.libc)
+
+  set(fq_build_target_name ${fq_target_name}.__build__)
+  add_executable(
+    ${fq_build_target_name}
+    EXCLUDE_FROM_ALL
+    # On NVPTX the dependency objects are passed directly because the static
+    # archive cannot be used there (see the link step below).
+    $<$<BOOL:${LIBC_TARGET_ARCHITECTURE_IS_NVPTX}>:${link_object_files}>
+    ${GPU_BENCHMARK_SRCS}
+    ${GPU_BENCHMARK_HDRS}
+  )
+  set_target_properties(${fq_build_target_name}
+    PROPERTIES
+      RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+  )
+
+  target_include_directories(${fq_build_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR})
+  target_include_directories(${fq_build_target_name} PRIVATE ${LIBC_SOURCE_DIR})
+  _get_hermetic_test_compile_options(compile_options "${GPU_BENCHMARK_COMPILE_OPTIONS}")
+  target_compile_options(${fq_build_target_name} PRIVATE ${compile_options})
+
+  # Prefer the hermetic flavour of every requested library when it exists.
+  # Note: the IN and LISTS keywords are case-sensitive.
+  set(link_libraries "")
+  foreach(lib IN LISTS GPU_BENCHMARK_LINK_LIBRARIES)
+    if(TARGET ${lib}.hermetic)
+      list(APPEND link_libraries ${lib}.hermetic)
+    else()
+      list(APPEND link_libraries ${lib})
+    endif()
+  endforeach()
+
+  # Only the NVPTX branch below sets a non-empty suffix for the startup object.
+  set(internal_suffix "")
+  if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
+    target_link_options(${fq_build_target_name} PRIVATE
+      ${LIBC_COMPILE_OPTIONS_DEFAULT}
+      -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto -Wno-multi-gpu
+      "-Wl,-mllvm,-amdgpu-lower-global-ctor-dtor=0" -nostdlib -static
+      "-Wl,-mllvm,-amdhsa-code-object-version=${LIBC_GPU_CODE_OBJECT_VERSION}")
+  elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
+    # We need to use the internal object versions for NVPTX.
+    set(internal_suffix ".__internal__")
+    target_link_options(${fq_build_target_name} PRIVATE
+      ${LIBC_COMPILE_OPTIONS_DEFAULT} -Wno-multi-gpu
+      "-Wl,--suppress-stack-size-warning"
+      -march=${LIBC_GPU_TARGET_ARCHITECTURE} -nostdlib -static
+      "--cuda-path=${LIBC_CUDA_ROOT}")
+  elseif(LIBC_CC_SUPPORTS_NOSTDLIBPP)
+    target_link_options(${fq_build_target_name} PRIVATE -nolibc -nostartfiles -nostdlib++ -static)
+  else()
+    # Older version of gcc does not support `nostdlib++` flag. We use
+    # `nostdlib` and link against libgcc_s, which cannot be linked statically.
+    target_link_options(${fq_build_target_name} PRIVATE -nolibc -nostartfiles -nostdlib)
+    list(APPEND link_libraries ${LIBGCC_S_LOCATION})
+  endif()
+
+  # link libraries for the BUILD target (i.e. to compile the test)
+  target_link_libraries(
+    ${fq_build_target_name}
+    PRIVATE
+      libc.startup.${LIBC_TARGET_OS}.crt1${internal_suffix}
+      ${link_libraries}
+      LibcGpuBenchmark.hermetic
+      LibcHermeticTestSupport.hermetic
+      # The NVIDIA 'nvlink' linker does not currently support static libraries.
+      $<$<NOT:$<BOOL:${LIBC_TARGET_ARCHITECTURE_IS_NVPTX}>>:${fq_target_name}.__libc__>)
+
+  add_dependencies(${fq_build_target_name}
+    LibcGpuBenchmark.hermetic
+    ${fq_deps_list})
+
+  # Tests on the GPU require an external loader utility to launch the kernel.
+  if(TARGET libc.utils.gpu.loader)
+    add_dependencies(${fq_build_target_name} libc.utils.gpu.loader)
+    get_target_property(gpu_loader_exe libc.utils.gpu.loader "EXECUTABLE")
+  endif()
+
+  set(test_cmd ${GPU_BENCHMARK_ENV}
+    $<$<BOOL:${LIBC_TARGET_OS_IS_GPU}>:${gpu_loader_exe}> ${CMAKE_CROSSCOMPILING_EMULATOR} ${GPU_BENCHMARK_LOADER_ARGS}
+    $<TARGET_FILE:${fq_build_target_name}> ${GPU_BENCHMARK_ARGS})
+  add_custom_target(
+    ${fq_target_name}
+    COMMAND ${test_cmd}
+    COMMAND_EXPAND_LISTS
+    COMMENT "Running GPU benchmark ${fq_target_name}"
+  )
+
+  # Make this benchmark part of its suite
+  add_dependencies(${GPU_BENCHMARK_SUITE} ${fq_target_name})
+  # Remember to make this benchmark part of the umbrella command
+  add_dependencies(gpu-benchmark ${fq_target_name})
+endfunction()
+
+# Support library that holds the benchmark runner, its main() and the logger.
+# add_gpu_benchmark links every benchmark against LibcGpuBenchmark.hermetic
+# (presumably the hermetic variant produced by this call -- confirm against
+# add_unittest_framework_library).
+add_unittest_framework_library(
+ LibcGpuBenchmark
+ SRCS
+ LibcGpuBenchmark.cpp
+ LibcGpuBenchmarkMain.cpp
+ BenchmarkLogger.cpp
+ HDRS
+ LibcGpuBenchmark.h
+ BenchmarkLogger.h
+ DEPENDS
+ libc.src.__support.big_int
+ libc.src.__support.c_string
+ libc.src.__support.CPP.string
+ libc.src.__support.CPP.string_view
+ libc.src.__support.CPP.type_traits
+ libc.src.__support.fixed_point.fx_rep
+ libc.src.__support.macros.properties.types
+ libc.src.__support.OSUtil.osutil
+ libc.src.__support.uint128
+ libc.benchmarks.gpu.timing.timing
+)
+
+add_subdirectory(src)
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
new file mode 100644
index 0000000000000..d37f5a0a53a70
--- /dev/null
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -0,0 +1,70 @@
+#include "LibcGpuBenchmark.h"
+
+namespace LIBC_NAMESPACE {
+namespace libc_gpu_benchmarks {
+
+Benchmark *Benchmark::Start = nullptr;
+Benchmark *Benchmark::End = nullptr;
+
+// Appends B to the tail of the global list of registered benchmarks.
+void Benchmark::addBenchmark(Benchmark *B) {
+  if (End != nullptr)
+    End->Next = B; // Non-empty list: chain B behind the current tail.
+  else
+    Start = B; // Empty list: B becomes the head as well.
+  End = B;
+}
+
+// Runs every registered benchmark in registration order. Always returns 0.
+int Benchmark::runBenchmarks() {
+  Benchmark *Current = Start;
+  while (Current != nullptr) {
+    Current->Run();
+    Current = Current->Next;
+  }
+  return 0;
+}
+
+// Repeatedly samples WrapperFunc and refines an estimate of its cost in
+// cycles per call.
+//
+// Each sample calls WrapperFunc `Iterations` times and sums the returned
+// cycle counts after subtracting the value of LIBC_NAMESPACE::overhead().
+// Sampling stops when
+//   - Options.MaxSamples samples were taken, or
+//   - the iteration count reached Options.MaxIterations, or
+//   - at least Options.MinSamples samples were taken and the estimate changed
+//     by less than Options.Epsilon relative to the previous one.
+// Between samples the iteration count is multiplied by Options.ScalingFactor.
+//
+// Returns the final estimate together with the number of samples and the
+// total number of iterations that were executed.
+BenchmarkResult benchmark(const BenchmarkOptions &Options,
+                          uint64_t (*WrapperFunc)()) {
+  BenchmarkResult Result;
+  RuntimeEstimationProgression REP;
+  size_t TotalIterations = 0;
+  size_t Iterations = Options.InitialIterations;
+  if (Iterations < 1)
+    Iterations = 1;
+  size_t Samples = 0;
+  uint64_t BestGuess = 0;
+  for (;;) {
+    uint64_t SampleCycles = 0;
+    for (size_t I = 0; I < Iterations; ++I) {
+      const uint64_t Overhead = LIBC_NAMESPACE::overhead();
+      const uint64_t Measured = WrapperFunc();
+      // Clamp at zero: with unsigned arithmetic a measurement smaller than
+      // the overhead would otherwise wrap around to a huge cycle count.
+      SampleCycles += Measured > Overhead ? Measured - Overhead : 0;
+    }
+
+    ++Samples;
+    TotalIterations += Iterations;
+    const double ChangeRatio =
+        REP.ComputeImprovement({Iterations, SampleCycles});
+    BestGuess = REP.CurrentEstimation;
+
+    if (Samples >= Options.MaxSamples || Iterations >= Options.MaxIterations)
+      break;
+    if (Samples >= Options.MinSamples && ChangeRatio < Options.Epsilon)
+      break;
+
+    // Grow the iteration count for the next sample. The product is truncated
+    // when it is converted back to an integer, so for small counts (for
+    // example 1 * 1.4) it would never advance; make sure a factor above 1
+    // always adds at least one iteration.
+    const size_t Scaled =
+        static_cast<size_t>(Iterations * Options.ScalingFactor);
+    if (Options.ScalingFactor > 1.0 && Scaled <= Iterations)
+      Iterations = Iterations + 1;
+    else
+      Iterations = Scaled;
+  }
+  Result.Cycles = BestGuess;
+  Result.Samples = Samples;
+  Result.TotalIterations = TotalIterations;
+  return Result;
+}
+
+} // namespace libc_gpu_benchmarks
+} // namespace LIBC_NAMESPACE
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
new file mode 100644
index 0000000000000..ccbbe3629dbda
--- /dev/null
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -0,0 +1,122 @@
+#ifndef LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H
+#define LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H
+
+#include "benchmarks/gpu/timing/timing.h"
+
+#include "benchmarks/gpu/BenchmarkLogger.h"
+
+#include <stddef.h>
+#include <stdint.h>
+
+namespace LIBC_NAMESPACE {
+
+namespace libc_gpu_benchmarks {
+
+// Tuning knobs read by benchmark().
+struct BenchmarkOptions {
+ // Iterations in the first sample; benchmark() raises values below 1 to 1.
+ uint32_t InitialIterations = 1;
+ // Sampling stops once the per-sample iteration count reaches this value.
+ uint32_t MaxIterations = 10000000;
+ // Minimum number of samples before the Epsilon check may stop the run.
+ uint32_t MinSamples = 4;
+ // Sampling stops once this many samples were taken.
+ uint32_t MaxSamples = 1000;
+ // Relative change of the estimate below which it counts as stable.
+ double Epsilon = 0.01;
+ // Factor applied to the iteration count after every sample.
+ double ScalingFactor = 1.4;
+};
+
+// One sample: how many iterations were run and how many cycles they took in
+// total.
+struct Measurement {
+ size_t Iterations = 0;
+ uint64_t ElapsedCycles = 0;
+};
+
+// Accumulates measurements and yields the running mean number of cycles per
+// iteration.
+class RefinableRuntimeEstimation {
+  uint64_t TotalCycles = 0;
+  size_t TotalIterations = 0;
+
+public:
+  // Adds M to the running totals and returns the updated mean (integer
+  // division). Returns 0 while no iterations have been recorded, instead of
+  // dividing by zero.
+  uint64_t Update(const Measurement &M) {
+    TotalCycles += M.ElapsedCycles;
+    TotalIterations += M.Iterations;
+    if (TotalIterations == 0)
+      return 0;
+    return TotalCycles / TotalIterations;
+  }
+};
+
+// Tracks the progression of the runtime estimation
+class RuntimeEstimationProgression {
+  RefinableRuntimeEstimation RRE;
+
+public:
+  // Most recent estimate in cycles per iteration; 0 before the first update.
+  uint64_t CurrentEstimation = 0;
+
+  // Feeds M into the estimator, stores the new estimate in CurrentEstimation
+  // and returns the magnitude of the relative change,
+  // |previous / new - 1|.
+  double ComputeImprovement(const Measurement &M) {
+    const uint64_t NewEstimation = RRE.Update(M);
+
+    // An estimate that stays at zero did not change. Computing 0 / 0 here
+    // would produce NaN, which never compares below a convergence threshold.
+    if (NewEstimation == 0 && CurrentEstimation == 0)
+      return 0.0;
+
+    double Ratio =
+        (static_cast<double>(CurrentEstimation) / NewEstimation) - 1.0;
+
+    // Get absolute value
+    if (Ratio < 0)
+      Ratio = -Ratio;
+
+    CurrentEstimation = NewEstimation;
+    return Ratio;
+  }
+};
+
+// Outcome of benchmark(): the final estimate in cycles per iteration, the
+// number of samples taken and the number of iterations run across all
+// samples.
+struct BenchmarkResult {
+ uint64_t Cycles = 0;
+ size_t Samples = 0;
+ size_t TotalIterations = 0;
+};
+
+BenchmarkResult benchmark(const BenchmarkOptions &Options,
+ uint64_t (*WrapperFunc)());
+
+// Base class of all benchmarks. Instances form a singly linked list
+// (Start .. End, chained through Next) that runBenchmarks() walks.
+class Benchmark {
+ // Next benchmark in registration order; set by addBenchmark().
+ Benchmark *Next = nullptr;
+
+public:
+ virtual ~Benchmark() {}
+ // Hooks with empty default implementations. runBenchmarks() as defined in
+ // LibcGpuBenchmark.cpp only calls Run() and does not invoke them.
+ virtual void SetUp() {}
+ virtual void TearDown() {}
+
+ // Runs all registered benchmarks in registration order.
+ static int runBenchmarks();
+
+protected:
+ // Appends a benchmark to the list; called by derived-class constructors.
+ static void addBenchmark(Benchmark *);
+
+private:
+ virtual void Run() = 0;
+ virtual const char *getName() const = 0;
+
+ // Head and tail of the registration list.
+ static Benchmark *Start;
+ static Benchmark *End;
+};
+
+// A benchmark defined by a plain function that performs one measurement and
+// returns the elapsed cycles. The constructor registers the instance in the
+// global benchmark list.
+class WrapperBenchmark : public Benchmark {
+  using BenchmarkWrapperFunction = uint64_t (*)();
+  BenchmarkWrapperFunction Func;
+  const char *Name;
+
+public:
+  WrapperBenchmark(BenchmarkWrapperFunction Func, char const *Name)
+      : Func(Func), Name(Name) {
+    addBenchmark(this);
+  }
+
+private:
+  // Runs the wrapped function with default options and logs the result.
+  void Run() override {
+    constexpr auto GREEN = "\033[32m";
+    constexpr auto RESET = "\033[0m";
+    // Announce the benchmark before running it, so the log shows which one
+    // is in progress while it executes.
+    blog << GREEN << "[ RUN ] " << RESET << Name << '\n';
+    BenchmarkOptions Options;
+    auto result = benchmark(Options, Func);
+    blog << GREEN << "[ OK ] " << RESET << Name << ": " << result.Cycles
+         << " cycles, " << result.TotalIterations << " iterations\n";
+  }
+  const char *getName() const override { return Name; }
+};
+} // namespace libc_gpu_benchmarks
+} // namespace LIBC_NAMESPACE
+
+// Registers Func as a benchmark named "SuiteName.TestName" by defining a
+// WrapperBenchmark object called SuiteName_TestName_Instance.
+// Note: the expansion already ends with a semicolon.
+#define BENCHMARK(SuiteName, TestName, Func) \
+ LIBC_NAMESPACE::libc_gpu_benchmarks::WrapperBenchmark \
+ SuiteName##_##TestName##_Instance(Func, #SuiteName "." #TestName);
+
+#endif
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp b/libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp
new file mode 100644
index 0000000000000..c971b00cc9a1b
--- /dev/null
+++ b/libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp
@@ -0,0 +1,6 @@
+#include "LibcGpuBenchmark.h"
+
+// Program entry point: calls Benchmark::runBenchmarks() and exits with 0. The
+// argc, argv and envp parameters are not used.
+// NOTE(review): the int returned by runBenchmarks() is discarded, so the exit
+// status is always 0 -- confirm whether it should be propagated instead.
+extern "C" int main(int argc, char **argv, char **envp) {
+ LIBC_NAMESPACE::libc_gpu_benchmarks::Benchmark::runBenchmarks();
+ return 0;
+}
diff --git a/libc/benchmarks/gpu/src/CMakeLists.txt b/libc/benchmarks/gpu/src/CMakeLists.txt
new file mode 100644
index 0000000000000..f15d082e4dd2b
--- /dev/null
+++ b/libc/benchmarks/gpu/src/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_subdirectory(ctype)
+add_subdirectory(math)
diff --git a/libc/benchmarks/gpu/src/ctype/CMakeLists.txt b/libc/benchmarks/gpu/src/ctype/CMakeLists.txt
new file mode 100644
index 0000000000000..ab2f6cdf0c7fd
--- /dev/null
+++ b/libc/benchmarks/gpu/src/ctype/CMakeLists.txt
@@ -0,0 +1,21 @@
+add_custom_target(libc-gpu-ctype-benchmarks)
+
+add_gpu_benchmark(
+ isalnum_benchmark
+ SUITE
+ libc-gpu-ctype-benchmarks
+ SRCS
+ isalnum_benchmark.cpp
+ DEPENDS
+ libc.src.ctype.isalnum
+)
+
+add_gpu_benchmark(
+ isalpha_benchmark
+ SUITE
+ libc-gpu-ctype-benchmarks
+ SRCS
+ isalpha_benchmark.cpp
+ DEPENDS
+ libc.src.ctype.isalpha
+)
diff --git a/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp b/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp
new file mode 100644
index 0000000000000..8d9c958bb7ed4
--- /dev/null
+++ b/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp
@@ -0,0 +1,22 @@
+#include "benchmarks/gpu/LibcGpuBenchmark.h"
+
+#include "src/ctype/isalnum.h"
+
+// Benchmark wrapper: returns the latency() measurement of a single isalnum
+// call with the character 'c' as input.
+uint64_t BM_IsAlnum() {
+ const char Input = 'c';
+ const uint64_t Cycles =
+ LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalnum, Input);
+ return Cycles;
+}
+BENCHMARK(LlvmLibcIsAlNumGpuBenchmark, IsAlnumWrapper, BM_IsAlnum);
+
+// Baseline callee for overhead estimation: it does nothing except return its
+// argument (converted to uint64_t). It is marked noinline so that calling it
+// remains a real function call.
+[[gnu::noinline]] static uint64_t single_input_function(int x) {
+ asm volatile("" ::"r"(x)); // prevent the compiler from optimizing out x
+ return x;
+}
+
+// Benchmark wrapper: returns the latency() measurement of isalnum('c') minus
+// the latency() measurement of a trivial baseline call
+// (single_input_function), clamped at zero.
+uint64_t BM_IsAlnumWithOverhead() {
+ char x = 'c';
+ // Take the two measurements in separate statements so that their order is
+ // fixed; operands of a single subtraction may be evaluated in either order.
+ const uint64_t Measured =
+ LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalnum, x);
+ const uint64_t Baseline =
+ LIBC_NAMESPACE::latency(single_input_function, 0);
+ // Both values are unsigned: if the baseline run happens to be the slower
+ // one, a plain subtraction would wrap around to an enormous cycle count.
+ return Measured > Baseline ? Measured - Baseline : 0;
+}
+BENCHMARK(LlvmLibcIsAlNumGpuBenchmark, IsAlnumWithOverhead,
+ BM_IsAlnumWithOverhead);
diff --git a/libc/benchmarks/gpu/src/ctype/isalpha_benchmark.cpp b/libc/benchmarks/gpu/src/ctype/isalpha_benchmark.cpp
new file mode 100644
index 0000000000000..2038eb89bc77b
--- /dev/null
+++ b/libc/benchmarks/gpu/src/ctype/isalpha_benchmark.cpp
@@ -0,0 +1,9 @@
+#include "benchmarks/gpu/LibcGpuBenchmark.h"
+
+#include "src/ctype/isalpha.h"
+
+// Benchmark wrapper: returns the latency() measurement of a single isalpha
+// call with the character 'c' as input.
+uint64_t BM_IsAlpha() {
+ const char Input = 'c';
+ const uint64_t Cycles =
+ LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalpha, Input);
+ return Cycles;
+}
+BENCHMARK(LlvmLibcIsAlphaGpuBenchmark, IsAlpha, BM_IsAlpha);
diff --git a/libc/benchmarks/gpu/src/math/CMakeLists.txt b/libc/benchmarks/gpu/src/math/CMakeLists.txt
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/libc/benchmarks/gpu/timing/CMakeLists.txt b/libc/benchmarks/gpu/timing/CMakeLists.txt
new file mode 100644
index 0000000000000..0e6a5a6b47968
--- /dev/null
+++ b/libc/benchmarks/gpu/timing/CMakeLists.txt
@@ -0,0 +1,12 @@
+foreach(target nvptx)
+ add_subdirectory(${target})
+ list(APPEND target_gpu_timing libc.benchmarks.gpu.timing.${target}.${target}_timing)
+endforeach()
+
+add_header_library(
+ timing
+ HDRS
+ timing.h
+ DEPENDS
+ ${target_gpu_timing}
+)
diff --git a/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt b/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt
new file mode 100644
index 0000000000000..9958e16206a41
--- /dev/null
+++ b/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt
@@ -0,0 +1,7 @@
+# Header-only library for the NVPTX timing helpers in timing.h. DEPENDS lists
+# one target per libc header that timing.h includes (common.h, GPU/utils.h,
+# macros/attributes.h and macros/config.h).
+add_header_library(
+ nvptx_timing
+ HDRS
+ timing.h
+ DEPENDS
+ libc.src.__support.common
+ libc.src.__support.GPU.utils
+ libc.src.__support.macros.attributes
+ libc.src.__support.macros.config
+)
diff --git a/libc/benchmarks/gpu/timing/nvptx/timing.h b/libc/benchmarks/gpu/timing/nvptx/timing.h
new file mode 100644
index 0000000000000..008432e6aa1d2
--- /dev/null
+++ b/libc/benchmarks/gpu/timing/nvptx/timing.h
@@ -0,0 +1,108 @@
+//===------------- NVPTX implementation of timing utils ---------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
+#define LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
+
+#include "src/__support/GPU/utils.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/attributes.h"
+#include "src/__support/macros/config.h"
+
+#include <stdint.h>
+
+namespace LIBC_NAMESPACE {
+
+// Returns the overhead associated with calling the profiling region. This
+// allows us to subtract the constant-time overhead from the latency to
+// obtain a true result. This can vary with system load.
+[[gnu::noinline]] static uint64_t overhead() {
+ // Read the operand back from a volatile so that it is not a compile-time
+ // constant.
+ volatile uint32_t x = 1;
+ uint32_t y = x;
+ gpu::sync_threads();
+ uint64_t start = gpu::processor_clock();
+ // Empty asm statement that takes both y and the start timestamp as inputs.
+ asm volatile("" ::"r"(y), "llr"(start));
+ uint32_t result = y;
+ // The same no-op "or" that latency() applies to its result, so the timed
+ // region here contains that instruction but no function under test.
+ asm volatile("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :);
+ uint64_t stop = gpu::processor_clock();
+ gpu::sync_threads();
+ // Volatile store so that result counts as used.
+ volatile auto storage = result;
+ return stop - start;
+}
+
+// Stimulate a simple function and obtain its latency in clock cycles on the
+// system. This function cannot be inlined or else it will disturb the very
+// delicate balance of hard-coded dependencies.
+//
+// `f` is the function under test and `t` is the single argument passed to it.
+// The return value is the difference between two gpu::processor_clock()
+// readings taken immediately before and after the call.
+//
+// FIXME: This does not work in general on NVPTX because of further
+// optimizations ptxas performs. The only way to get consistent results is to
+// pass an extra "SHELL:-Xcuda-ptxas -O0" to CMake's compiler flag. This
+// negatively impacts performance but it is at least stable.
+template <typename F, typename T>
+[[gnu::noinline]] static LIBC_INLINE uint64_t latency(F f, T t) {
+ // We need to store the input somewhere to guarantee that the compiler will
+ // not constant propagate it and remove the profiling region.
+ volatile T storage = t;
+ T arg = storage;
+ asm volatile("" ::"r"(arg));
+
+ // Get the current timestamp from the clock.
+ gpu::sync_threads();
+ __nvvm_membar_sys();
+ uint64_t start = gpu::processor_clock();
+
+ // This forces the compiler to load the input argument and run the clock cycle
+ // counter before the profiling region.
+ asm volatile("" ::"r"(arg), "llr"(start));
+
+ // Run the function under test and return its value.
+ auto result = f(arg);
+
+ // This inline assembly performs a no-op which forces the result to both be
+ // used and prevents us from exiting this region before it's complete.
+ // NOTE(review): the instruction is a 32-bit "or" with an "r" operand, so
+ // this presumably expects `result` to fit a 32-bit register -- confirm for
+ // functions with wider return types.
+ asm volatile("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :);
+
+ // Obtain the current timestamp after running the calculation and force
+ // ordering.
+ uint64_t stop = gpu::processor_clock();
+ __nvvm_membar_sys();
+ gpu::sync_threads();
+ asm volatile("" ::"r"(stop));
+ // NOTE(review): `output` is declared with the argument type T, so the result
+ // is converted to T here; the two-argument overload uses `volatile auto`
+ // instead -- confirm which is intended.
+ volatile T output = result;
+
+ // Return the time elapsed.
+ return stop - start;
+}
+
+// Two-argument counterpart of the single-argument latency(): returns the
+// difference between two gpu::processor_clock() readings taken immediately
+// before and after one call to f(t1, t2).
+//
+// Like the single-argument overload, this function is marked noinline (that
+// overload documents that inlining disturbs the hard-coded dependencies) and
+// brackets the timed region with the same thread barrier and memory fence, so
+// that figures from the two overloads are obtained the same way.
+template <typename F, typename T1, typename T2>
+[[gnu::noinline]] static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {
+ // Route both inputs through volatile storage so that the compiler cannot
+ // constant propagate them and remove the profiling region.
+ volatile T1 storage = t1;
+ volatile T2 storage2 = t2;
+ T1 arg = storage;
+ T2 arg2 = storage2;
+ asm volatile("" ::"r"(arg), "r"(arg2));
+
+ // Get the current timestamp from the clock.
+ gpu::sync_threads();
+ __nvvm_membar_sys();
+ uint64_t start = gpu::processor_clock();
+
+ // Force both arguments to be loaded and the clock to be read before the
+ // profiling region.
+ asm volatile("" ::"r"(arg), "r"(arg2), "llr"(start));
+
+ // Run the function under test.
+ auto result = f(arg, arg2);
+
+ // No-op that forces the result to be used before the region is left.
+ asm volatile("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :);
+
+ // Obtain the timestamp after the call and force ordering.
+ uint64_t stop = gpu::processor_clock();
+ __nvvm_membar_sys();
+ gpu::sync_threads();
+ asm volatile("" ::"r"(stop));
+ volatile auto output = result;
+
+ // Return the time elapsed.
+ return stop - start;
+}
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
diff --git a/libc/benchmarks/gpu/timing/timing.h b/libc/benchmarks/gpu/timing/timing.h
new file mode 100644
index 0000000000000..c47bb0d9ebb55
--- /dev/null
+++ b/libc/benchmarks/gpu/timing/timing.h
@@ -0,0 +1,22 @@
+//===------------- Implementation of GPU timing utils -----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_UTILS_GPU_TIMING_H
+#define LLVM_LIBC_UTILS_GPU_TIMING_H
+
+#include "src/__support/macros/properties/architectures.h"
+
+#if defined(LIBC_TARGET_ARCH_IS_AMDGPU)
+#error "amdgpu not yet supported
+#elif defined(LIBC_TARGET_ARCH_IS_NVPTX)
+#include "nvptx/timing.h"
+#else
+#error "unsupported platform"
+#endif
+
+#endif // LLVM_LIBC_UTILS_GPU_TIMING_H
>From f8291e91be692061ab3d240d78a2112b89cbc342 Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Wed, 15 May 2024 12:47:40 -0400
Subject: [PATCH 02/16] refactor cmake rules
---
libc/benchmarks/CMakeLists.txt | 397 +++++++++----------
libc/benchmarks/gpu/BenchmarkLogger.cpp | 24 +-
libc/benchmarks/gpu/CMakeLists.txt | 162 +-------
libc/benchmarks/gpu/src/ctype/CMakeLists.txt | 32 +-
libc/benchmarks/gpu/timing/CMakeLists.txt | 14 +-
libc/benchmarks/gpu/timing/timing.h | 2 +-
libc/cmake/modules/LLVMLibCTestRules.cmake | 10 +-
7 files changed, 259 insertions(+), 382 deletions(-)
diff --git a/libc/benchmarks/CMakeLists.txt b/libc/benchmarks/CMakeLists.txt
index a802e653a091e..8b51511e3b5cf 100644
--- a/libc/benchmarks/CMakeLists.txt
+++ b/libc/benchmarks/CMakeLists.txt
@@ -1,211 +1,210 @@
-if(NOT LIBC_TARGET_OS_IS_GPU)
- find_package(Threads)
-
- set(LLVM_LINK_COMPONENTS
- Support
- TargetParser
- )
-
- #==============================================================================
- # Add Unit Testing Support
- #==============================================================================
-
- function(add_libc_benchmark_unittest target_name)
- if(NOT LLVM_INCLUDE_TESTS)
- return()
- endif()
-
- cmake_parse_arguments(
- "LIBC_BENCHMARKS_UNITTEST"
- "" # No optional arguments
- "SUITE" # Single value arguments
- "SRCS;DEPENDS" # Multi-value arguments
- ${ARGN}
- )
-
- add_executable(${target_name}
- EXCLUDE_FROM_ALL
- ${LIBC_BENCHMARKS_UNITTEST_SRCS}
- )
- target_link_libraries(${target_name}
- PRIVATE
- llvm_gtest_main
- llvm_gtest
- ${LIBC_BENCHMARKS_UNITTEST_DEPENDS}
- )
- llvm_update_compile_flags(${target_name})
-
- add_custom_command(
- TARGET ${target_name}
- POST_BUILD
- COMMAND $<TARGET_FILE:${target_name}>
- )
- add_dependencies(libc-benchmark-util-tests ${target_name})
- endfunction()
-
- #==============================================================================
- # Build Google Benchmark for libc
- #==============================================================================
-
- include(ExternalProject)
- ExternalProject_Add(google-benchmark-libc
- EXCLUDE_FROM_ALL ON
- PREFIX google-benchmark-libc
- SOURCE_DIR ${LLVM_THIRD_PARTY_DIR}/benchmark
- INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/google-benchmark-libc
- CMAKE_CACHE_ARGS
- -DBENCHMARK_ENABLE_EXCEPTIONS:BOOL=OFF
- -DBENCHMARK_ENABLE_LTO:BOOL=OFF
- -DBENCHMARK_ENABLE_TESTING:BOOL=OFF
- -DBENCHMARK_ENABLE_WERROR:BOOL=${LLVM_ENABLE_WERROR}
- -DBENCHMARK_FORCE_WERROR:BOOL=OFF
- -DBENCHMARK_USE_LIBCXX:BOOL=OFF
- -DCMAKE_BUILD_TYPE:STRING=Release
-
- -DCMAKE_SYSTEM_NAME:STRING=${CMAKE_SYSTEM_NAME}
- -DCMAKE_SYSTEM_PROCESSOR:STRING=${CMAKE_SYSTEM_PROCESSOR}
- -DCMAKE_C_COMPILER:STRING=${CMAKE_C_COMPILER}
- -DCMAKE_CXX_COMPILER:STRING=${CMAKE_CXX_COMPILER}
- -DCMAKE_CXX_FLAGS:STRING=${CMAKE_CXX_FLAGS}
- -DCMAKE_FIND_ROOT_PATH:STRING=${CMAKE_FIND_ROOT_PATH}
-
- -DBUILD_SHARED_LIBS:BOOL=OFF
- -DCMAKE_EXE_LINKER_FLAGS:STRING=-static
-
- -DCMAKE_CXX_STANDARD:STRING=14
- -DCMAKE_INSTALL_PREFIX:PATH=<INSTALL_DIR>
- )
-
- add_custom_target(libc-benchmark-util-tests)
-
- # libc-benchmark
- add_library(libc-benchmark
- STATIC
- EXCLUDE_FROM_ALL
- LibcBenchmark.cpp
- LibcBenchmark.h
- )
+if(LIBC_TARGET_OS_IS_GPU)
+ add_subdirectory(gpu)
+ return()
+endif()
- target_include_directories(libc-benchmark
- PUBLIC ${LLVM_INCLUDE_DIR} ${LLVM_MAIN_INCLUDE_DIR}
- )
- target_link_libraries(libc-benchmark
- PUBLIC
- benchmark::benchmark
- LLVMSupport
- LLVMTargetParser
- Threads::Threads
- )
- add_dependencies(libc-benchmark google-benchmark-libc)
- llvm_update_compile_flags(libc-benchmark)
+find_package(Threads)
- add_libc_benchmark_unittest(libc-benchmark-test
- SRCS LibcBenchmarkTest.cpp
- DEPENDS libc-benchmark
+set(LLVM_LINK_COMPONENTS
+ Support
+ TargetParser
)
- # libc-memory-benchmark
- add_library(libc-memory-benchmark
- STATIC
- EXCLUDE_FROM_ALL
- LibcMemoryBenchmark.cpp
- LibcMemoryBenchmark.h
- LibcFunctionPrototypes.h
- MemorySizeDistributions.cpp
- MemorySizeDistributions.h
- )
- target_include_directories(libc-memory-benchmark
- PUBLIC
- ${CMAKE_CURRENT_SOURCE_DIR}
+#==============================================================================
+# Add Unit Testing Support
+#==============================================================================
+
+function(add_libc_benchmark_unittest target_name)
+ if(NOT LLVM_INCLUDE_TESTS)
+ return()
+ endif()
+
+ cmake_parse_arguments(
+ "LIBC_BENCHMARKS_UNITTEST"
+ "" # No optional arguments
+ "SUITE" # Single value arguments
+ "SRCS;DEPENDS" # Multi-value arguments
+ ${ARGN}
)
- target_link_libraries(libc-memory-benchmark
- PUBLIC
- libc-benchmark
- )
- llvm_update_compile_flags(libc-memory-benchmark)
- add_libc_benchmark_unittest(libc-memory-benchmark-test
- SRCS LibcMemoryBenchmarkTest.cpp
- DEPENDS libc-memory-benchmark
+ add_executable(${target_name}
+ EXCLUDE_FROM_ALL
+ ${LIBC_BENCHMARKS_UNITTEST_SRCS}
)
-
- # json
- add_library(json
- STATIC
- EXCLUDE_FROM_ALL
- JSON.cpp
- JSON.h
+ target_link_libraries(${target_name}
+ PRIVATE
+ llvm_gtest_main
+ llvm_gtest
+ ${LIBC_BENCHMARKS_UNITTEST_DEPENDS}
)
- target_link_libraries(json PUBLIC libc-memory-benchmark)
- llvm_update_compile_flags(json)
+ llvm_update_compile_flags(${target_name})
- add_libc_benchmark_unittest(json-test
- SRCS JSONTest.cpp
- DEPENDS json
+ add_custom_command(
+ TARGET ${target_name}
+ POST_BUILD
+ COMMAND $<TARGET_FILE:${target_name}>
)
-
- #==============================================================================
- # Benchmarking tool
- #==============================================================================
-
- # Benchmark all implementations that can run on the target CPU.
- function(add_libc_multi_impl_benchmark name)
- get_property(fq_implementations GLOBAL PROPERTY ${name}_implementations)
- foreach(fq_config_name IN LISTS fq_implementations)
- get_target_property(required_cpu_features ${fq_config_name} REQUIRE_CPU_FEATURES)
- cpu_supports(can_run "${required_cpu_features}")
- if(can_run)
- set(benchmark_name ${fq_config_name}_benchmark)
- add_executable(${benchmark_name}
- EXCLUDE_FROM_ALL
- LibcMemoryBenchmarkMain.cpp
- )
- get_target_property(entrypoint_object_file ${fq_config_name} "OBJECT_FILE_RAW")
- target_link_libraries(${benchmark_name} PUBLIC json ${entrypoint_object_file})
- string(TOUPPER ${name} name_upper)
- target_compile_definitions(${benchmark_name} PRIVATE "-DLIBC_BENCHMARK_FUNCTION_${name_upper}=LIBC_NAMESPACE::${name}" "-DLIBC_BENCHMARK_FUNCTION_NAME=\"${fq_config_name}\"")
- llvm_update_compile_flags(${benchmark_name})
- else()
- message(STATUS "Skipping benchmark for '${fq_config_name}' insufficient host cpu features '${required_cpu_features}'")
- endif()
- endforeach()
- endfunction()
-
- add_libc_multi_impl_benchmark(bcmp)
- add_libc_multi_impl_benchmark(bzero)
- add_libc_multi_impl_benchmark(memcmp)
- add_libc_multi_impl_benchmark(memcpy)
- add_libc_multi_impl_benchmark(memmove)
- add_libc_multi_impl_benchmark(memset)
-
- #==============================================================================
- # Google Benchmarking tool
- #==============================================================================
-
- # This target uses the Google Benchmark facility to report throughput for llvm
- # libc memory functions compiled for the host machine. This is useful to
- # continuously monitor the performance of the memory functions.
- add_executable(libc.benchmarks.memory_functions.opt_host
- EXCLUDE_FROM_ALL
- LibcMemoryGoogleBenchmarkMain.cpp
- LibcDefaultImplementations.cpp
- )
- target_link_libraries(libc.benchmarks.memory_functions.opt_host
- PRIVATE
- libc-memory-benchmark
- libc.src.string.memcmp_opt_host.__internal__
- libc.src.string.bcmp_opt_host.__internal__
- libc.src.string.memcpy_opt_host.__internal__
- libc.src.string.memset_opt_host.__internal__
- libc.src.string.bzero_opt_host.__internal__
- libc.src.string.memmove_opt_host.__internal__
- benchmark_main
+ add_dependencies(libc-benchmark-util-tests ${target_name})
+endfunction()
+
+#==============================================================================
+# Build Google Benchmark for libc
+#==============================================================================
+
+include(ExternalProject)
+ExternalProject_Add(google-benchmark-libc
+ EXCLUDE_FROM_ALL ON
+ PREFIX google-benchmark-libc
+ SOURCE_DIR ${LLVM_THIRD_PARTY_DIR}/benchmark
+ INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/google-benchmark-libc
+ CMAKE_CACHE_ARGS
+ -DBENCHMARK_ENABLE_EXCEPTIONS:BOOL=OFF
+ -DBENCHMARK_ENABLE_LTO:BOOL=OFF
+ -DBENCHMARK_ENABLE_TESTING:BOOL=OFF
+ -DBENCHMARK_ENABLE_WERROR:BOOL=${LLVM_ENABLE_WERROR}
+ -DBENCHMARK_FORCE_WERROR:BOOL=OFF
+ -DBENCHMARK_USE_LIBCXX:BOOL=OFF
+ -DCMAKE_BUILD_TYPE:STRING=Release
+
+ -DCMAKE_SYSTEM_NAME:STRING=${CMAKE_SYSTEM_NAME}
+ -DCMAKE_SYSTEM_PROCESSOR:STRING=${CMAKE_SYSTEM_PROCESSOR}
+ -DCMAKE_C_COMPILER:STRING=${CMAKE_C_COMPILER}
+ -DCMAKE_CXX_COMPILER:STRING=${CMAKE_CXX_COMPILER}
+ -DCMAKE_CXX_FLAGS:STRING=${CMAKE_CXX_FLAGS}
+ -DCMAKE_FIND_ROOT_PATH:STRING=${CMAKE_FIND_ROOT_PATH}
+
+ -DBUILD_SHARED_LIBS:BOOL=OFF
+ -DCMAKE_EXE_LINKER_FLAGS:STRING=-static
+
+ -DCMAKE_CXX_STANDARD:STRING=14
+ -DCMAKE_INSTALL_PREFIX:PATH=<INSTALL_DIR>
)
- llvm_update_compile_flags(libc.benchmarks.memory_functions.opt_host)
-
- add_subdirectory(automemcpy)
-endif()
-if(LIBC_TARGET_OS_IS_GPU)
- add_subdirectory(gpu)
-endif()
+add_custom_target(libc-benchmark-util-tests)
+
+# libc-benchmark
+add_library(libc-benchmark
+ STATIC
+ EXCLUDE_FROM_ALL
+ LibcBenchmark.cpp
+ LibcBenchmark.h
+)
+
+target_include_directories(libc-benchmark
+ PUBLIC ${LLVM_INCLUDE_DIR} ${LLVM_MAIN_INCLUDE_DIR}
+)
+target_link_libraries(libc-benchmark
+ PUBLIC
+ benchmark::benchmark
+ LLVMSupport
+ LLVMTargetParser
+ Threads::Threads
+)
+add_dependencies(libc-benchmark google-benchmark-libc)
+llvm_update_compile_flags(libc-benchmark)
+
+add_libc_benchmark_unittest(libc-benchmark-test
+ SRCS LibcBenchmarkTest.cpp
+ DEPENDS libc-benchmark
+)
+
+# libc-memory-benchmark
+add_library(libc-memory-benchmark
+ STATIC
+ EXCLUDE_FROM_ALL
+ LibcMemoryBenchmark.cpp
+ LibcMemoryBenchmark.h
+ LibcFunctionPrototypes.h
+ MemorySizeDistributions.cpp
+ MemorySizeDistributions.h
+)
+target_include_directories(libc-memory-benchmark
+ PUBLIC
+ ${CMAKE_CURRENT_SOURCE_DIR}
+)
+target_link_libraries(libc-memory-benchmark
+ PUBLIC
+ libc-benchmark
+)
+llvm_update_compile_flags(libc-memory-benchmark)
+
+add_libc_benchmark_unittest(libc-memory-benchmark-test
+ SRCS LibcMemoryBenchmarkTest.cpp
+ DEPENDS libc-memory-benchmark
+)
+
+# json
+add_library(json
+ STATIC
+ EXCLUDE_FROM_ALL
+ JSON.cpp
+ JSON.h
+)
+target_link_libraries(json PUBLIC libc-memory-benchmark)
+llvm_update_compile_flags(json)
+
+add_libc_benchmark_unittest(json-test
+ SRCS JSONTest.cpp
+ DEPENDS json
+)
+
+#==============================================================================
+# Benchmarking tool
+#==============================================================================
+
+# Benchmark all implementations that can run on the target CPU.
+#
+# Usage: add_libc_multi_impl_benchmark(<name>)
+#
+# Reads the list of implementation targets from the GLOBAL property
+# `<name>_implementations`. For each entry whose REQUIRE_CPU_FEATURES pass
+# cpu_supports(), an EXCLUDE_FROM_ALL executable `<entry>_benchmark` is built
+# from LibcMemoryBenchmarkMain.cpp; other entries are reported with a STATUS
+# message and skipped.
+function(add_libc_multi_impl_benchmark name)
+ get_property(fq_implementations GLOBAL PROPERTY ${name}_implementations)
+ foreach(fq_config_name IN LISTS fq_implementations)
+ get_target_property(required_cpu_features ${fq_config_name} REQUIRE_CPU_FEATURES)
+ cpu_supports(can_run "${required_cpu_features}")
+ if(can_run)
+ set(benchmark_name ${fq_config_name}_benchmark)
+ add_executable(${benchmark_name}
+ EXCLUDE_FROM_ALL
+ LibcMemoryBenchmarkMain.cpp
+ )
+ # Link the implementation's object file (OBJECT_FILE_RAW target property)
+ # together with the json library.
+ get_target_property(entrypoint_object_file ${fq_config_name} "OBJECT_FILE_RAW")
+ target_link_libraries(${benchmark_name} PUBLIC json ${entrypoint_object_file})
+ # Define LIBC_BENCHMARK_FUNCTION_<NAME> as the function to benchmark and
+ # LIBC_BENCHMARK_FUNCTION_NAME as the implementation target's name.
+ string(TOUPPER ${name} name_upper)
+ target_compile_definitions(${benchmark_name} PRIVATE "-DLIBC_BENCHMARK_FUNCTION_${name_upper}=LIBC_NAMESPACE::${name}" "-DLIBC_BENCHMARK_FUNCTION_NAME=\"${fq_config_name}\"")
+ llvm_update_compile_flags(${benchmark_name})
+ else()
+ message(STATUS "Skipping benchmark for '${fq_config_name}' insufficient host cpu features '${required_cpu_features}'")
+ endif()
+ endforeach()
+endfunction()
+
+add_libc_multi_impl_benchmark(bcmp)
+add_libc_multi_impl_benchmark(bzero)
+add_libc_multi_impl_benchmark(memcmp)
+add_libc_multi_impl_benchmark(memcpy)
+add_libc_multi_impl_benchmark(memmove)
+add_libc_multi_impl_benchmark(memset)
+
+#==============================================================================
+# Google Benchmarking tool
+#==============================================================================
+
+# This target uses the Google Benchmark facility to report throughput for llvm
+# libc memory functions compiled for the host machine. This is useful to
+# continuously monitor the performance of the memory functions.
+add_executable(libc.benchmarks.memory_functions.opt_host
+ EXCLUDE_FROM_ALL
+ LibcMemoryGoogleBenchmarkMain.cpp
+ LibcDefaultImplementations.cpp
+)
+target_link_libraries(libc.benchmarks.memory_functions.opt_host
+ PRIVATE
+ libc-memory-benchmark
+ libc.src.string.memcmp_opt_host.__internal__
+ libc.src.string.bcmp_opt_host.__internal__
+ libc.src.string.memcpy_opt_host.__internal__
+ libc.src.string.memset_opt_host.__internal__
+ libc.src.string.bzero_opt_host.__internal__
+ libc.src.string.memmove_opt_host.__internal__
+ benchmark_main
+)
+llvm_update_compile_flags(libc.benchmarks.memory_functions.opt_host)
+
+add_subdirectory(automemcpy)
diff --git a/libc/benchmarks/gpu/BenchmarkLogger.cpp b/libc/benchmarks/gpu/BenchmarkLogger.cpp
index 94a0d897c9585..4f70d23a1e95e 100644
--- a/libc/benchmarks/gpu/BenchmarkLogger.cpp
+++ b/libc/benchmarks/gpu/BenchmarkLogger.cpp
@@ -13,18 +13,21 @@ namespace libc_gpu_benchmarks {
// cpp::string_view specialization
template <>
-BenchmarkLogger &BenchmarkLogger::operator<< <cpp::string_view>(cpp::string_view str) {
+BenchmarkLogger &
+ BenchmarkLogger::operator<< <cpp::string_view>(cpp::string_view str) {
LIBC_NAMESPACE::write_to_stderr(str);
return *this;
}
// cpp::string specialization
-template <> BenchmarkLogger &BenchmarkLogger::operator<< <cpp::string>(cpp::string str) {
+template <>
+BenchmarkLogger &BenchmarkLogger::operator<< <cpp::string>(cpp::string str) {
return *this << static_cast<cpp::string_view>(str);
}
// const char* specialization
-template <> BenchmarkLogger &BenchmarkLogger::operator<< <const char *>(const char *str) {
+template <>
+BenchmarkLogger &BenchmarkLogger::operator<< <const char *>(const char *str) {
return *this << cpp::string_view(str);
}
@@ -66,15 +69,20 @@ template BenchmarkLogger &BenchmarkLogger::operator<< <short>(short);
template BenchmarkLogger &BenchmarkLogger::operator<< <int>(int);
template BenchmarkLogger &BenchmarkLogger::operator<< <long>(long);
template BenchmarkLogger &BenchmarkLogger::operator<< <long long>(long long);
-template BenchmarkLogger &BenchmarkLogger::operator<< <unsigned char>(unsigned char);
-template BenchmarkLogger &BenchmarkLogger::operator<< <unsigned short>(unsigned short);
-template BenchmarkLogger &BenchmarkLogger::operator<< <unsigned int>(unsigned int);
-template BenchmarkLogger &BenchmarkLogger::operator<< <unsigned long>(unsigned long);
+template BenchmarkLogger &
+ BenchmarkLogger::operator<< <unsigned char>(unsigned char);
+template BenchmarkLogger &
+ BenchmarkLogger::operator<< <unsigned short>(unsigned short);
+template BenchmarkLogger &
+ BenchmarkLogger::operator<< <unsigned int>(unsigned int);
+template BenchmarkLogger &
+ BenchmarkLogger::operator<< <unsigned long>(unsigned long);
template BenchmarkLogger &
BenchmarkLogger::operator<< <unsigned long long>(unsigned long long);
#ifdef LIBC_TYPES_HAS_INT128
-template BenchmarkLogger &BenchmarkLogger::operator<< <__uint128_t>(__uint128_t);
+template BenchmarkLogger &
+ BenchmarkLogger::operator<< <__uint128_t>(__uint128_t);
#endif // LIBC_TYPES_HAS_INT128
template BenchmarkLogger &BenchmarkLogger::operator<< <UInt<128>>(UInt<128>);
template BenchmarkLogger &BenchmarkLogger::operator<< <UInt<192>>(UInt<192>);
diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
index a18be27e33573..5dafe66bbd738 100644
--- a/libc/benchmarks/gpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/CMakeLists.txt
@@ -2,161 +2,25 @@ add_subdirectory(timing)
add_custom_target(gpu-benchmark)
-function (add_gpu_benchmark test_name)
- if(NOT TARGET libc.startup.${LIBC_TARGET_OS}.crt1)
- message(VERBOSE "Skipping ${fq_target_name} as it is not available on ${LIBC_TARGET_OS}.")
- return()
- endif()
-
+# Registers a GPU benchmark.
+#
+# Usage:
+#   add_benchmark(<benchmark_name>
+#     [LINK_LIBRARIES <lib>...]
+#     <any other add_libc_hermetic_test arguments, e.g. SUITE/SRCS/DEPENDS>
+#   )
+#
+# The benchmark is created through add_libc_hermetic_test() with IS_BENCHMARK
+# set and LibcGpuBenchmark.hermetic prepended to the link libraries; every
+# argument not parsed here is forwarded unchanged. The resulting target is
+# added as a dependency of the `gpu-benchmark` umbrella target.
+#
+# LINK_LIBRARIES is parsed as a multi-value keyword: parsing it as an option
+# would set BENCHMARK_LINK_LIBRARIES to TRUE/FALSE, and that boolean would
+# then be forwarded to add_libc_hermetic_test() as if it were a library name.
+function(add_benchmark benchmark_name)
cmake_parse_arguments(
- "GPU_BENCHMARK"
- "" # No optional arguments
- "SUITE" # Single value arguments
- "SRCS;HDRS;DEPENDS;ARGS;ENV;COMPILE_OPTIONS;LINK_LIBRARIES;LOADER_ARGS" # Multi-value arguments
+ "BENCHMARK"
+ "" # No optional arguments
+ "" # Single value arguments
+ "LINK_LIBRARIES" # Multi-value arguments
${ARGN}
)
-
- if(NOT GPU_BENCHMARK_SUITE)
- message(FATAL_ERROR "SUITE not specified for ${fq_target_name}")
- endif()
- if(NOT GPU_BENCHMARK_SRCS)
- message(FATAL_ERROR "The SRCS list for add_gpu_benchmark is missing.")
- endif()
-
- get_fq_target_name(${test_name} fq_target_name)
- get_fq_target_name(${test_name}.libc fq_libc_target_name) # Stores the compiled libc + infrastructure archive to link in
- get_fq_deps_list(fq_deps_list ${GPU_BENCHMARK_DEPENDS})
- list(APPEND fq_deps_list
- # Hermetic tests use the platform's startup object. So, their deps also
- # have to be collected.
- libc.startup.${LIBC_TARGET_OS}.crt1
- # We always add the memory functions objects. This is because the
- # compiler's codegen can emit calls to the C memory functions.
- libc.src.string.bcmp
- libc.src.string.bzero
- libc.src.string.memcmp
- libc.src.string.memcpy
- libc.src.string.memmove
- libc.src.string.memset
- libc.src.__support.StringUtil.error_to_string
- )
-
- list(REMOVE_DUPLICATES fq_deps_list)
-
- # TODO: Instead of gathering internal object files from entrypoints,
- # collect the object files with public names of entrypoints.
- get_object_files_for_test(
- link_object_files skipped_entrypoints_list ${fq_deps_list})
- if(skipped_entrypoints_list)
- if(LIBC_CMAKE_VERBOSE_LOGGING)
- set(msg "Skipping hermetic test ${fq_target_name} as it has missing deps: "
- "${skipped_entrypoints_list}.")
- endif()
- return()
- endif()
- list(REMOVE_DUPLICATES link_object_files)
-
- # Make a library of all deps
- add_library(
- ${fq_target_name}.__libc__
- STATIC
- EXCLUDE_FROM_ALL
- ${link_object_files}
- )
- set_target_properties(${fq_target_name}.__libc__
- PROPERTIES ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
- set_target_properties(${fq_target_name}.__libc__
- PROPERTIES ARCHIVE_OUTPUT_NAME ${fq_target_name}.libc)
-
- set(fq_build_target_name ${fq_target_name}.__build__)
- add_executable(
- ${fq_build_target_name}
- EXCLUDE_FROM_ALL
- $<$<BOOL:${LIBC_TARGET_ARCHITECTURE_IS_NVPTX}>:${link_object_files}>
- ${GPU_BENCHMARK_SRCS}
- ${GPU_BENCHMARK_HDRS}
- )
- set_target_properties(${fq_build_target_name}
- PROPERTIES
- RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
- )
-
- _get_hermetic_test_compile_options(compile_options "${GPU_BENCHMARK_COMPILE_OPTIONS}")
- target_include_directories(${fq_build_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR})
- target_include_directories(${fq_build_target_name} PRIVATE ${LIBC_SOURCE_DIR})
- _get_hermetic_test_compile_options(compile_options "${GPU_BENCHMARK_COMPILE_OPTIONS}")
- target_compile_options(${fq_build_target_name} PRIVATE ${compile_options})
-
- set(link_libraries "")
- foreach(lib in LISTS GPU_BENCHMARK_LINK_LIBRARIES)
- if(TARGET ${lib}.hermetic)
- list(APPEND link_libraries ${lib}.hermetic)
- else()
- list(APPEND link_libraries ${lib})
- endif()
- endforeach()
-
- if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
- target_link_options(${fq_build_target_name} PRIVATE
- ${LIBC_COMPILE_OPTIONS_DEFAULT}
- -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto -Wno-multi-gpu
- "-Wl,-mllvm,-amdgpu-lower-global-ctor-dtor=0" -nostdlib -static
- "-Wl,-mllvm,-amdhsa-code-object-version=${LIBC_GPU_CODE_OBJECT_VERSION}")
- elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
- # We need to use the internal object versions for NVPTX.
- set(internal_suffix ".__internal__")
- target_link_options(${fq_build_target_name} PRIVATE
- ${LIBC_COMPILE_OPTIONS_DEFAULT} -Wno-multi-gpu
- "-Wl,--suppress-stack-size-warning"
- -march=${LIBC_GPU_TARGET_ARCHITECTURE} -nostdlib -static
- "--cuda-path=${LIBC_CUDA_ROOT}")
- elseif(LIBC_CC_SUPPORTS_NOSTDLIBPP)
- target_link_options(${fq_build_target_name} PRIVATE -nolibc -nostartfiles -nostdlib++ -static)
- else()
- # Older version of gcc does not support `nostdlib++` flag. We use
- # `nostdlib` and link against libgcc_s, which cannot be linked statically.
- target_link_options(${fq_build_target_name} PRIVATE -nolibc -nostartfiles -nostdlib)
- list(APPEND link_libraries ${LIBGCC_S_LOCATION})
- endif()
-
- # link libraries for the BUILD target (i.e. to compile the test)
- target_link_libraries(
- ${fq_build_target_name}
- PRIVATE
- libc.startup.${LIBC_TARGET_OS}.crt1${internal_suffix}
- ${link_libraries}
- # LibcTest.hermetic
+ add_libc_hermetic_test(
+ ${benchmark_name}
+ IS_BENCHMARK
+ LINK_LIBRARIES
LibcGpuBenchmark.hermetic
- # LibcHermeticTestSupport.hermetic
- LibcHermeticTestSupport.hermetic
- # The NVIDIA 'nvlink' linker does not currently support static libraries.
- $<$<NOT:$<BOOL:${LIBC_TARGET_ARCHITECTURE_IS_NVPTX}>>:${fq_target_name}.__libc__>)
-
- add_dependencies(${fq_build_target_name}
- LibcGpuBenchmark.hermetic
- ${fq_deps_list})
-
- # Tests on the GPU require an external loader utility to launch the kernel.
- if(TARGET libc.utils.gpu.loader)
- add_dependencies(${fq_build_target_name} libc.utils.gpu.loader)
- get_target_property(gpu_loader_exe libc.utils.gpu.loader "EXECUTABLE")
- endif()
-
- set(test_cmd ${GPU_BENCHMARK_ENV}
- $<$<BOOL:${LIBC_TARGET_OS_IS_GPU}>:${gpu_loader_exe}> ${CMAKE_CROSSCOMPILING_EMULATOR} ${GPU_BENCHMARK_LOADER_ARGS}
- $<TARGET_FILE:${fq_build_target_name}> ${GPU_BENCHMARK_ARGS})
- add_custom_target(
- ${fq_target_name}
- COMMAND ${test_cmd}
- COMMAND_EXPAND_LISTS
- COMMENT "Running GPU benchmark ${fq_target_name}"
+ ${BENCHMARK_LINK_LIBRARIES}
+ ${BENCHMARK_UNPARSED_ARGUMENTS}
)
-
- # Make this benchmark part of its suite
- add_dependencies(${GPU_BENCHMARK_SUITE} ${fq_target_name})
- # Remember to make this benchmark part of the umbrella command
+ get_fq_target_name(${benchmark_name} fq_target_name)
add_dependencies(gpu-benchmark ${fq_target_name})
-endfunction(add_gpu_benchmark)
+endfunction(add_benchmark)
add_unittest_framework_library(
LibcGpuBenchmark
diff --git a/libc/benchmarks/gpu/src/ctype/CMakeLists.txt b/libc/benchmarks/gpu/src/ctype/CMakeLists.txt
index ab2f6cdf0c7fd..8d448b8ced955 100644
--- a/libc/benchmarks/gpu/src/ctype/CMakeLists.txt
+++ b/libc/benchmarks/gpu/src/ctype/CMakeLists.txt
@@ -1,21 +1,21 @@
add_custom_target(libc-gpu-ctype-benchmarks)
-add_gpu_benchmark(
- isalnum_benchmark
- SUITE
- libc-gpu-ctype-benchmarks
- SRCS
- isalnum_benchmark.cpp
- DEPENDS
- libc.src.ctype.isalnum
+add_benchmark(
+ isalnum_benchmark
+ SUITE
+ libc-gpu-ctype-benchmarks
+ SRCS
+ isalnum_benchmark.cpp
+ DEPENDS
+ libc.src.ctype.isalnum
)
-add_gpu_benchmark(
- isalpha_benchmark
- SUITE
- libc-gpu-ctype-benchmarks
- SRCS
- isalpha_benchmark.cpp
- DEPENDS
- libc.src.ctype.isalpha
+add_benchmark(
+ isalpha_benchmark
+ SUITE
+ libc-gpu-ctype-benchmarks
+ SRCS
+ isalpha_benchmark.cpp
+ DEPENDS
+ libc.src.ctype.isalpha
)
diff --git a/libc/benchmarks/gpu/timing/CMakeLists.txt b/libc/benchmarks/gpu/timing/CMakeLists.txt
index 0e6a5a6b47968..8bbc7e33f122a 100644
--- a/libc/benchmarks/gpu/timing/CMakeLists.txt
+++ b/libc/benchmarks/gpu/timing/CMakeLists.txt
@@ -1,12 +1,12 @@
foreach(target nvptx)
- add_subdirectory(${target})
- list(APPEND target_gpu_timing libc.benchmarks.gpu.timing.${target}.${target}_timing)
+ add_subdirectory(${target})
+ list(APPEND target_gpu_timing libc.benchmarks.gpu.timing.${target}.${target}_timing)
endforeach()
add_header_library(
- timing
- HDRS
- timing.h
- DEPENDS
- ${target_gpu_timing}
+ timing
+ HDRS
+ timing.h
+ DEPENDS
+ ${target_gpu_timing}
)
diff --git a/libc/benchmarks/gpu/timing/timing.h b/libc/benchmarks/gpu/timing/timing.h
index c47bb0d9ebb55..180ea77954ae5 100644
--- a/libc/benchmarks/gpu/timing/timing.h
+++ b/libc/benchmarks/gpu/timing/timing.h
@@ -12,7 +12,7 @@
#include "src/__support/macros/properties/architectures.h"
#if defined(LIBC_TARGET_ARCH_IS_AMDGPU)
-#error "amdgpu not yet supported
+#error "amdgpu not yet supported"
#elif defined(LIBC_TARGET_ARCH_IS_NVPTX)
#include "nvptx/timing.h"
#else
diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake
index c8d7c8a2b1c7c..278137774e089 100644
--- a/libc/cmake/modules/LLVMLibCTestRules.cmake
+++ b/libc/cmake/modules/LLVMLibCTestRules.cmake
@@ -550,7 +550,7 @@ function(add_libc_hermetic_test test_name)
endif()
cmake_parse_arguments(
"HERMETIC_TEST"
- "" # No optional arguments
+ "IS_BENCHMARK" # Optional arguments
"SUITE" # Single value arguments
"SRCS;HDRS;DEPENDS;ARGS;ENV;COMPILE_OPTIONS;LINK_LIBRARIES;LOADER_ARGS" # Multi-value arguments
${ARGN}
@@ -651,6 +651,13 @@ function(add_libc_hermetic_test test_name)
endif()
endforeach()
+ # Benchmarks require a separate library with a different `main` function
+ if(HERMETIC_TEST_IS_BENCHMARK)
+ list(APPEND link_libraries LibcGpuBenchmark.hermetic)
+ else()
+ list(APPEND link_libraries LibcTest.hermetic)
+ endif()
+
if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
target_link_options(${fq_build_target_name} PRIVATE
${LIBC_COMPILE_OPTIONS_DEFAULT}
@@ -678,7 +685,6 @@ function(add_libc_hermetic_test test_name)
PRIVATE
libc.startup.${LIBC_TARGET_OS}.crt1${internal_suffix}
${link_libraries}
- LibcTest.hermetic
LibcHermeticTestSupport.hermetic
# The NVIDIA 'nvlink' linker does not currently support static libraries.
$<$<NOT:$<BOOL:${LIBC_TARGET_ARCHITECTURE_IS_NVPTX}>>:${fq_target_name}.__libc__>)
>From 1129ccc33651c46ec22c6cd3d679abbb1829b3ba Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Thu, 16 May 2024 13:08:37 -0400
Subject: [PATCH 03/16] fix code style
---
libc/benchmarks/gpu/CMakeLists.txt | 1 +
libc/benchmarks/gpu/LibcGpuBenchmark.cpp | 87 ++++++++-------
libc/benchmarks/gpu/LibcGpuBenchmark.h | 105 +++++++++----------
libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp | 2 +-
4 files changed, 97 insertions(+), 98 deletions(-)
diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
index 5dafe66bbd738..db2953f6fcf23 100644
--- a/libc/benchmarks/gpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/CMakeLists.txt
@@ -37,6 +37,7 @@ add_unittest_framework_library(
libc.src.__support.CPP.string
libc.src.__support.CPP.string_view
libc.src.__support.CPP.type_traits
+ libc.src.__support.CPP.functional
libc.src.__support.fixed_point.fx_rep
libc.src.__support.macros.properties.types
libc.src.__support.OSUtil.osutil
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index d37f5a0a53a70..087b59689d90b 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -3,67 +3,66 @@
namespace LIBC_NAMESPACE {
namespace libc_gpu_benchmarks {
-Benchmark *Benchmark::Start = nullptr;
-Benchmark *Benchmark::End = nullptr;
+Benchmark *Benchmark::start = nullptr;
+Benchmark *Benchmark::end = nullptr;
-void Benchmark::addBenchmark(Benchmark *B) {
- if (End == nullptr) {
- Start = B;
- End = B;
+void Benchmark::add_benchmark(Benchmark *benchmark) {
+ if (end == nullptr) {
+ start = benchmark;
+ end = benchmark;
return;
}
-
- End->Next = B;
- End = B;
+ end->next = benchmark;
+ end = benchmark;
}
-int Benchmark::runBenchmarks() {
- for (Benchmark *B = Start; B != nullptr; B = B->Next) {
- B->Run();
- }
-
+int Benchmark::run_benchmarks() {
+ for (Benchmark *b = start; b != nullptr; b = b->next)
+ b->run();
return 0;
}
-BenchmarkResult benchmark(const BenchmarkOptions &Options,
- uint64_t (*WrapperFunc)()) {
- BenchmarkResult Result;
- RuntimeEstimationProgression REP;
- size_t TotalIterations = 0;
- size_t Iterations = Options.InitialIterations;
- if (Iterations < (uint32_t)1) {
- Iterations = 1;
- }
- size_t Samples = 0;
- uint64_t BestGuess = 0;
- uint64_t TotalCycles = 0;
+BenchmarkResult benchmark(const BenchmarkOptions &options,
+ cpp::function<uint64_t(void)> wrapper_func) {
+ BenchmarkResult result;
+ RuntimeEstimationProgression rep;
+ size_t total_iterations = 0;
+ size_t iterations = options.initial_iterations;
+ if (iterations < (uint32_t)1)
+ iterations = 1;
+
+ size_t samples = 0;
+ uint64_t best_guess = 0;
+ uint64_t total_cycles = 0;
for (;;) {
- uint64_t SampleCycles = 0;
- for (uint32_t i = 0; i < Iterations; i++) {
- auto overhead = LIBC_NAMESPACE::overhead();
- uint64_t result = WrapperFunc() - overhead;
- SampleCycles += result;
+ uint64_t sample_cycles = 0;
+ uint64_t overhead = LIBC_NAMESPACE::overhead();
+ for (uint32_t i = 0; i < iterations; i++) {
+ uint64_t result = wrapper_func() - overhead;
+ sample_cycles += result;
}
- Samples++;
- TotalCycles += SampleCycles;
- TotalIterations += Iterations;
- const double ChangeRatio =
- REP.ComputeImprovement({Iterations, SampleCycles});
- BestGuess = REP.CurrentEstimation;
+ samples++;
+ total_cycles += sample_cycles;
+ total_iterations += iterations;
+ const double change_ratio =
+ rep.compute_improvement({iterations, sample_cycles});
+ best_guess = rep.current_estimation;
- if (Samples >= Options.MaxSamples || Iterations >= Options.MaxIterations) {
+ if (samples >= options.max_samples ||
+ iterations >= options.max_iterations) {
break;
- } else if (Samples >= Options.MinSamples && ChangeRatio < Options.Epsilon) {
+ } else if (samples >= options.min_samples &&
+ change_ratio < options.epsilon) {
break;
}
- Iterations *= Options.ScalingFactor;
+ iterations *= options.scaling_factor;
}
- Result.Cycles = BestGuess;
- Result.Samples = Samples;
- Result.TotalIterations = TotalIterations;
- return Result;
+ result.cycles = best_guess;
+ result.samples = samples;
+ result.total_iterations = total_iterations;
+ return result;
};
} // namespace libc_gpu_benchmarks
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index ccbbe3629dbda..3d762631f2d96 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -1,9 +1,10 @@
#ifndef LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H
#define LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H
-#include "benchmarks/gpu/timing/timing.h"
-
#include "benchmarks/gpu/BenchmarkLogger.h"
+#include "benchmarks/gpu/timing/timing.h"
+#include "src/__support/CPP/functional.h"
+#include "src/__support/CPP/string_view.h"
#include <stddef.h>
#include <stdint.h>
@@ -13,104 +14,102 @@ namespace LIBC_NAMESPACE {
namespace libc_gpu_benchmarks {
struct BenchmarkOptions {
- uint32_t InitialIterations = 1;
- uint32_t MaxIterations = 10000000;
- uint32_t MinSamples = 4;
- uint32_t MaxSamples = 1000;
- double Epsilon = 0.01;
- double ScalingFactor = 1.4;
+ uint32_t initial_iterations = 1;
+ uint32_t max_iterations = 10000000;
+ uint32_t min_samples = 4;
+ uint32_t max_samples = 1000;
+ double epsilon = 0.01;
+ double scaling_factor = 1.4;
};
struct Measurement {
- size_t Iterations = 0;
- uint64_t ElapsedCycles = 0;
+ size_t iterations = 0;
+ uint64_t elapsed_cycles = 0;
};
class RefinableRuntimeEstimation {
- uint64_t TotalCycles = 0;
- size_t TotalIterations = 0;
+ uint64_t total_cycles = 0;
+ size_t total_iterations = 0;
public:
- uint64_t Update(const Measurement &M) {
- TotalCycles += M.ElapsedCycles;
- TotalIterations += M.Iterations;
- return TotalCycles / TotalIterations;
+ uint64_t update(const Measurement &M) {
+ total_cycles += M.elapsed_cycles;
+ total_iterations += M.iterations;
+ return total_cycles / total_iterations;
}
};
// Tracks the progression of the runtime estimation
class RuntimeEstimationProgression {
- RefinableRuntimeEstimation RRE;
+ RefinableRuntimeEstimation rre;
public:
- uint64_t CurrentEstimation = 0;
+ uint64_t current_estimation = 0;
- double ComputeImprovement(const Measurement &M) {
- const uint64_t NewEstimation = RRE.Update(M);
- double Ratio = ((double)CurrentEstimation / NewEstimation) - 1.0;
+ double compute_improvement(const Measurement &M) {
+ const uint64_t new_estimation = rre.update(M);
+ double ratio = ((double)current_estimation / new_estimation) - 1.0;
// Get absolute value
- if (Ratio < 0) {
- Ratio *= -1;
- }
+ if (ratio < 0)
+ ratio *= -1;
- CurrentEstimation = NewEstimation;
- return Ratio;
+ current_estimation = new_estimation;
+ return ratio;
}
};
struct BenchmarkResult {
- uint64_t Cycles = 0;
- size_t Samples = 0;
- size_t TotalIterations = 0;
+ uint64_t cycles = 0;
+ size_t samples = 0;
+ size_t total_iterations = 0;
};
-BenchmarkResult benchmark(const BenchmarkOptions &Options,
- uint64_t (*WrapperFunc)());
+BenchmarkResult benchmark(const BenchmarkOptions &options,
+ cpp::function<uint64_t(void)> wrapper_func);
class Benchmark {
- Benchmark *Next = nullptr;
+ Benchmark *next = nullptr;
public:
virtual ~Benchmark() {}
- virtual void SetUp() {}
- virtual void TearDown() {}
+ virtual void set_up() {}
+ virtual void tear_down() {}
- static int runBenchmarks();
+ static int run_benchmarks();
protected:
- static void addBenchmark(Benchmark *);
+ static void add_benchmark(Benchmark *);
private:
- virtual void Run() = 0;
- virtual const char *getName() const = 0;
+ virtual void run() = 0;
+ virtual const cpp::string_view get_name() const = 0;
- static Benchmark *Start;
- static Benchmark *End;
+ static Benchmark *start;
+ static Benchmark *end;
};
class WrapperBenchmark : public Benchmark {
- using BenchmarkWrapperFunction = uint64_t (*)();
- BenchmarkWrapperFunction Func;
- const char *Name;
+ const cpp::function<uint64_t(void)> func;
+ const cpp::string_view name;
public:
- WrapperBenchmark(BenchmarkWrapperFunction Func, char const *Name)
- : Func(Func), Name(Name) {
- addBenchmark(this);
+ WrapperBenchmark(cpp::function<uint64_t(void)> func, char const *name)
+ : func(func), name(name) {
+ add_benchmark(this);
}
private:
- void Run() override {
- BenchmarkOptions Options;
- auto result = benchmark(Options, Func);
+ void run() override {
+ BenchmarkOptions options;
+ auto result = benchmark(options, func);
constexpr auto GREEN = "\033[32m";
constexpr auto RESET = "\033[0m";
- blog << GREEN << "[ RUN ] " << RESET << Name << '\n';
- blog << GREEN << "[ OK ] " << RESET << Name << ": " << result.Cycles
- << " cycles, " << result.TotalIterations << " iterations\n";
+ blog << GREEN << "[ RUN ] " << RESET << name << '\n';
+ blog << GREEN << "[ OK ] " << RESET << name << ": " << result.cycles
+ << " cycles, " << result.total_iterations << " iterations\n";
}
- const char *getName() const override { return Name; }
+ const cpp::string_view get_name() const override { return name; }
};
} // namespace libc_gpu_benchmarks
} // namespace LIBC_NAMESPACE
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp b/libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp
index c971b00cc9a1b..510fd13210494 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp
@@ -1,6 +1,6 @@
#include "LibcGpuBenchmark.h"
extern "C" int main(int argc, char **argv, char **envp) {
- LIBC_NAMESPACE::libc_gpu_benchmarks::Benchmark::runBenchmarks();
+ LIBC_NAMESPACE::libc_gpu_benchmarks::Benchmark::run_benchmarks();
return 0;
}
>From 5c46009bbbddd2114a11fabd2e3afbebed7488f7 Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Fri, 17 May 2024 16:22:23 -0400
Subject: [PATCH 04/16] measure walltime, standard deviation, min, and max
---
libc/benchmarks/gpu/CMakeLists.txt | 7 +++++
libc/benchmarks/gpu/LibcGpuBenchmark.cpp | 31 ++++++++++++++++---
libc/benchmarks/gpu/LibcGpuBenchmark.h | 12 ++++++-
.../gpu/src/ctype/isalnum_benchmark.cpp | 13 --------
libc/benchmarks/gpu/timing/nvptx/timing.h | 13 +++-----
5 files changed, 50 insertions(+), 26 deletions(-)
diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
index db2953f6fcf23..b9ca85393cc2e 100644
--- a/libc/benchmarks/gpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/CMakeLists.txt
@@ -10,6 +10,9 @@ function(add_benchmark benchmark_name)
"" # Multi-value arguments
${ARGN}
)
+ if(NOT libc.src.time.clock IN_LIST TARGET_LLVMLIBC_ENTRYPOINTS)
+ message(FATAL_ERROR "target does not support clock")
+ endif()
add_libc_hermetic_test(
${benchmark_name}
IS_BENCHMARK
@@ -38,10 +41,14 @@ add_unittest_framework_library(
libc.src.__support.CPP.string_view
libc.src.__support.CPP.type_traits
libc.src.__support.CPP.functional
+ libc.src.__support.CPP.limits
+ libc.src.__support.CPP.algorithm
libc.src.__support.fixed_point.fx_rep
libc.src.__support.macros.properties.types
libc.src.__support.OSUtil.osutil
libc.src.__support.uint128
+ libc.src.___support.FPUtil.sqrt
+ libc.src.time.clock
libc.benchmarks.gpu.timing.timing
)
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index 087b59689d90b..3ecff18884b34 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -1,4 +1,7 @@
#include "LibcGpuBenchmark.h"
+#include "src/__support/CPP/algorithm.h"
+#include "src/__support/FPUtil/sqrt.h"
+#include "src/time/gpu/time_utils.h"
namespace LIBC_NAMESPACE {
namespace libc_gpu_benchmarks {
@@ -32,27 +35,42 @@ BenchmarkResult benchmark(const BenchmarkOptions &options,
iterations = 1;
size_t samples = 0;
+ uint64_t total_time = 0;
uint64_t best_guess = 0;
uint64_t total_cycles = 0;
+ uint64_t cycles_2 = 0;
+ uint64_t min = UINT_MAX;
+ uint64_t max = 0;
for (;;) {
uint64_t sample_cycles = 0;
uint64_t overhead = LIBC_NAMESPACE::overhead();
+ const clock_t start = (double)clock();
for (uint32_t i = 0; i < iterations; i++) {
- uint64_t result = wrapper_func() - overhead;
+ auto wrapper_intermediate = wrapper_func();
+ uint64_t result = wrapper_intermediate - overhead;
+ max = cpp::max(max, result);
+ min = cpp::min(min, result);
sample_cycles += result;
}
-
+ const clock_t end = clock();
+ const clock_t duration_ns =
+ ((end - start) * 1000 * 1000 * 1000) / CLOCKS_PER_SEC;
+ total_time += duration_ns;
samples++;
total_cycles += sample_cycles;
+ cycles_2 += sample_cycles * sample_cycles;
+
total_iterations += iterations;
const double change_ratio =
rep.compute_improvement({iterations, sample_cycles});
best_guess = rep.current_estimation;
if (samples >= options.max_samples ||
- iterations >= options.max_iterations) {
+ iterations >= options.max_iterations ||
+ total_time >= options.max_duration) {
break;
- } else if (samples >= options.min_samples &&
+ } else if (total_time >= options.min_duration &&
+ samples >= options.min_samples &&
change_ratio < options.epsilon) {
break;
}
@@ -60,8 +78,13 @@ BenchmarkResult benchmark(const BenchmarkOptions &options,
iterations *= options.scaling_factor;
}
result.cycles = best_guess;
+ result.standard_deviation = fputil::sqrt((double)cycles_2 / total_iterations -
+ (best_guess * best_guess));
+ result.min = min;
+ result.max = max;
result.samples = samples;
result.total_iterations = total_iterations;
+ result.total_time = total_time;
return result;
};
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index 3d762631f2d96..798ae06086b1a 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -4,7 +4,9 @@
#include "benchmarks/gpu/BenchmarkLogger.h"
#include "benchmarks/gpu/timing/timing.h"
#include "src/__support/CPP/functional.h"
+#include "src/__support/CPP/limits.h"
#include "src/__support/CPP/string_view.h"
+#include "src/time/clock.h"
#include <stddef.h>
#include <stdint.h>
@@ -18,6 +20,8 @@ struct BenchmarkOptions {
uint32_t max_iterations = 10000000;
uint32_t min_samples = 4;
uint32_t max_samples = 1000;
+ uint64_t min_duration = 0; // in nanoseconds (ns)
+ uint64_t max_duration = 1000 * 1000 * 1000; // 1e9 nanoseconds = 1 second
double epsilon = 0.01;
double scaling_factor = 1.4;
};
@@ -61,8 +65,12 @@ class RuntimeEstimationProgression {
struct BenchmarkResult {
uint64_t cycles = 0;
+ double standard_deviation = 0;
+ uint64_t min = UINT_MAX;
+ uint64_t max = 0;
size_t samples = 0;
size_t total_iterations = 0;
+ clock_t total_time = 0;
};
BenchmarkResult benchmark(const BenchmarkOptions &options,
@@ -107,7 +115,9 @@ class WrapperBenchmark : public Benchmark {
constexpr auto RESET = "\033[0m";
blog << GREEN << "[ RUN ] " << RESET << name << '\n';
blog << GREEN << "[ OK ] " << RESET << name << ": " << result.cycles
- << " cycles, " << result.total_iterations << " iterations\n";
+ << " cycles, " << result.min << " min, " << result.max << " max, "
+ << result.total_iterations << " iterations, " << result.total_time
+ << " ns, " << (long)result.standard_deviation << " stddev\n";
}
const cpp::string_view get_name() const override { return name; }
};
diff --git a/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp b/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp
index 8d9c958bb7ed4..4050bc0ec77b9 100644
--- a/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp
+++ b/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp
@@ -7,16 +7,3 @@ uint64_t BM_IsAlnum() {
return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalnum, x);
}
BENCHMARK(LlvmLibcIsAlNumGpuBenchmark, IsAlnumWrapper, BM_IsAlnum);
-
-[[gnu::noinline]] static uint64_t single_input_function(int x) {
- asm volatile("" ::"r"(x)); // prevent the compiler from optimizing out x
- return x;
-}
-
-uint64_t BM_IsAlnumWithOverhead() {
- char x = 'c';
- return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalnum, x) -
- LIBC_NAMESPACE::latency(single_input_function, 0);
-}
-BENCHMARK(LlvmLibcIsAlNumGpuBenchmark, IsAlnumWithOverhead,
- BM_IsAlnumWithOverhead);
diff --git a/libc/benchmarks/gpu/timing/nvptx/timing.h b/libc/benchmarks/gpu/timing/nvptx/timing.h
index 008432e6aa1d2..001bdd3686062 100644
--- a/libc/benchmarks/gpu/timing/nvptx/timing.h
+++ b/libc/benchmarks/gpu/timing/nvptx/timing.h
@@ -37,12 +37,7 @@ namespace LIBC_NAMESPACE {
// Stimulate a simple function and obtain its latency in clock cycles on the
// system. This function cannot be inlined or else it will disturb the very
-// deliccate balance of hard-coded dependencies.
-//
-// FIXME: This does not work in general on NVPTX because of further
-// optimizations ptxas performs. The only way to get consistent results is to
-// pass and extra "SHELL:-Xcuda-ptxas -O0" to CMake's compiler flag. This
-// negatively implacts performance but it is at least stable.
+// delicate balance of hard-coded dependencies.
template <typename F, typename T>
[[gnu::noinline]] static LIBC_INLINE uint64_t latency(F f, T t) {
// We need to store the input somewhere to guarantee that the compiler will
@@ -53,7 +48,7 @@ template <typename F, typename T>
// Get the current timestamp from the clock.
gpu::sync_threads();
- __nvvm_membar_sys();
+ gpu::memory_fence();
uint64_t start = gpu::processor_clock();
// This forces the compiler to load the input argument and run the clock cycle
@@ -70,7 +65,7 @@ template <typename F, typename T>
// Obtain the current timestamp after running the calculation and force
// ordering.
uint64_t stop = gpu::processor_clock();
- __nvvm_membar_sys();
+ gpu::memory_fence();
gpu::sync_threads();
asm volatile("" ::"r"(stop));
volatile T output = result;
@@ -88,6 +83,7 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {
asm volatile("" ::"r"(arg), "r"(arg2));
gpu::sync_threads();
+ gpu::memory_fence();
uint64_t start = gpu::processor_clock();
asm volatile("" ::"r"(arg), "r"(arg2), "llr"(start));
@@ -97,6 +93,7 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {
asm volatile("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :);
uint64_t stop = gpu::processor_clock();
+ gpu::memory_fence();
gpu::sync_threads();
asm volatile("" ::"r"(stop));
volatile auto output = result;
>From e50ea99befc4279ea1987c47cf5084a55f2f8a47 Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Wed, 22 May 2024 16:08:39 -0400
Subject: [PATCH 05/16] fixed vector for benchmarks
---
libc/benchmarks/CMakeLists.txt | 114 ++++++++++++-----------
libc/benchmarks/gpu/CMakeLists.txt | 1 +
libc/benchmarks/gpu/LibcGpuBenchmark.cpp | 16 +---
libc/benchmarks/gpu/LibcGpuBenchmark.h | 4 +-
4 files changed, 65 insertions(+), 70 deletions(-)
diff --git a/libc/benchmarks/CMakeLists.txt b/libc/benchmarks/CMakeLists.txt
index 8b51511e3b5cf..221d4e11d383d 100644
--- a/libc/benchmarks/CMakeLists.txt
+++ b/libc/benchmarks/CMakeLists.txt
@@ -1,14 +1,14 @@
if(LIBC_TARGET_OS_IS_GPU)
- add_subdirectory(gpu)
- return()
+ add_subdirectory(gpu)
+ return()
endif()
find_package(Threads)
set(LLVM_LINK_COMPONENTS
- Support
- TargetParser
- )
+ Support
+ TargetParser
+)
#==============================================================================
# Add Unit Testing Support
@@ -16,35 +16,37 @@ set(LLVM_LINK_COMPONENTS
function(add_libc_benchmark_unittest target_name)
if(NOT LLVM_INCLUDE_TESTS)
- return()
+ return()
endif()
- cmake_parse_arguments(
- "LIBC_BENCHMARKS_UNITTEST"
- "" # No optional arguments
- "SUITE" # Single value arguments
- "SRCS;DEPENDS" # Multi-value arguments
- ${ARGN}
+ cmake_parse_arguments(if(LIBC_TARGET_OS_IS_GPU)
+ add_subdirectory(gpu)
+ return()
+ "LIBC_BENCHMARKS_UNITTEST"
+ "" # No optional arguments
+ "SUITE" # Single value arguments
+ "SRCS;DEPENDS" # Multi-value arguments
+ ${ARGN}
)
- add_executable(${target_name}
- EXCLUDE_FROM_ALL
- ${LIBC_BENCHMARKS_UNITTEST_SRCS}
- )
- target_link_libraries(${target_name}
- PRIVATE
- llvm_gtest_main
- llvm_gtest
- ${LIBC_BENCHMARKS_UNITTEST_DEPENDS}
- )
- llvm_update_compile_flags(${target_name})
-
- add_custom_command(
- TARGET ${target_name}
- POST_BUILD
- COMMAND $<TARGET_FILE:${target_name}>
- )
- add_dependencies(libc-benchmark-util-tests ${target_name})
+ add_executable(${target_name}
+ EXCLUDE_FROM_ALL
+ ${LIBC_BENCHMARKS_UNITTEST_SRCS}
+ )
+ target_link_libraries(${target_name}
+ PRIVATE
+ llvm_gtest_main
+ llvm_gtest
+ ${LIBC_BENCHMARKS_UNITTEST_DEPENDS}
+ )
+ llvm_update_compile_flags(${target_name})
+
+ add_custom_command(
+ TARGET ${target_name}
+ POST_BUILD
+ COMMAND $<TARGET_FILE:${target_name}>
+ )
+ add_dependencies(libc-benchmark-util-tests ${target_name})
endfunction()
#==============================================================================
@@ -53,32 +55,32 @@ endfunction()
include(ExternalProject)
ExternalProject_Add(google-benchmark-libc
- EXCLUDE_FROM_ALL ON
- PREFIX google-benchmark-libc
- SOURCE_DIR ${LLVM_THIRD_PARTY_DIR}/benchmark
- INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/google-benchmark-libc
- CMAKE_CACHE_ARGS
- -DBENCHMARK_ENABLE_EXCEPTIONS:BOOL=OFF
- -DBENCHMARK_ENABLE_LTO:BOOL=OFF
- -DBENCHMARK_ENABLE_TESTING:BOOL=OFF
- -DBENCHMARK_ENABLE_WERROR:BOOL=${LLVM_ENABLE_WERROR}
- -DBENCHMARK_FORCE_WERROR:BOOL=OFF
- -DBENCHMARK_USE_LIBCXX:BOOL=OFF
- -DCMAKE_BUILD_TYPE:STRING=Release
-
- -DCMAKE_SYSTEM_NAME:STRING=${CMAKE_SYSTEM_NAME}
- -DCMAKE_SYSTEM_PROCESSOR:STRING=${CMAKE_SYSTEM_PROCESSOR}
- -DCMAKE_C_COMPILER:STRING=${CMAKE_C_COMPILER}
- -DCMAKE_CXX_COMPILER:STRING=${CMAKE_CXX_COMPILER}
- -DCMAKE_CXX_FLAGS:STRING=${CMAKE_CXX_FLAGS}
- -DCMAKE_FIND_ROOT_PATH:STRING=${CMAKE_FIND_ROOT_PATH}
-
- -DBUILD_SHARED_LIBS:BOOL=OFF
- -DCMAKE_EXE_LINKER_FLAGS:STRING=-static
-
- -DCMAKE_CXX_STANDARD:STRING=14
- -DCMAKE_INSTALL_PREFIX:PATH=<INSTALL_DIR>
- )
+ EXCLUDE_FROM_ALL ON
+ PREFIX google-benchmark-libc
+ SOURCE_DIR ${LLVM_THIRD_PARTY_DIR}/benchmark
+ INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/google-benchmark-libc
+ CMAKE_CACHE_ARGS
+ -DBENCHMARK_ENABLE_EXCEPTIONS:BOOL=OFF
+ -DBENCHMARK_ENABLE_LTO:BOOL=OFF
+ -DBENCHMARK_ENABLE_TESTING:BOOL=OFF
+ -DBENCHMARK_ENABLE_WERROR:BOOL=${LLVM_ENABLE_WERROR}
+ -DBENCHMARK_FORCE_WERROR:BOOL=OFF
+ -DBENCHMARK_USE_LIBCXX:BOOL=OFF
+ -DCMAKE_BUILD_TYPE:STRING=Release
+
+ -DCMAKE_SYSTEM_NAME:STRING=${CMAKE_SYSTEM_NAME}
+ -DCMAKE_SYSTEM_PROCESSOR:STRING=${CMAKE_SYSTEM_PROCESSOR}
+ -DCMAKE_C_COMPILER:STRING=${CMAKE_C_COMPILER}
+ -DCMAKE_CXX_COMPILER:STRING=${CMAKE_CXX_COMPILER}
+ -DCMAKE_CXX_FLAGS:STRING=${CMAKE_CXX_FLAGS}
+ -DCMAKE_FIND_ROOT_PATH:STRING=${CMAKE_FIND_ROOT_PATH}
+
+ -DBUILD_SHARED_LIBS:BOOL=OFF
+ -DCMAKE_EXE_LINKER_FLAGS:STRING=-static
+
+ -DCMAKE_CXX_STANDARD:STRING=14
+ -DCMAKE_INSTALL_PREFIX:PATH=<INSTALL_DIR>
+ )
add_custom_target(libc-benchmark-util-tests)
diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
index b9ca85393cc2e..51fc267df807d 100644
--- a/libc/benchmarks/gpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/CMakeLists.txt
@@ -48,6 +48,7 @@ add_unittest_framework_library(
libc.src.__support.OSUtil.osutil
libc.src.__support.uint128
libc.src.___support.FPUtil.sqrt
+ libc.src.__support.fixedvector
libc.src.time.clock
libc.benchmarks.gpu.timing.timing
)
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index 3ecff18884b34..f8021c873242f 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -6,22 +6,16 @@
namespace LIBC_NAMESPACE {
namespace libc_gpu_benchmarks {
-Benchmark *Benchmark::start = nullptr;
-Benchmark *Benchmark::end = nullptr;
+FixedVector<Benchmark *, 64> benchmarks_to_run;
void Benchmark::add_benchmark(Benchmark *benchmark) {
- if (end == nullptr) {
- start = benchmark;
- end = benchmark;
- return;
- }
- end->next = benchmark;
- end = benchmark;
+ benchmarks_to_run.push_back(benchmark);
}
int Benchmark::run_benchmarks() {
- for (Benchmark *b = start; b != nullptr; b = b->next)
- b->run();
+ for (auto it = benchmarks_to_run.rbegin(), e = benchmarks_to_run.rend();
+ it != e; ++it)
+ (*it)->run();
return 0;
}
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index 798ae06086b1a..459e4d9b6ea98 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -6,6 +6,7 @@
#include "src/__support/CPP/functional.h"
#include "src/__support/CPP/limits.h"
#include "src/__support/CPP/string_view.h"
+#include "src/__support/fixedvector.h"
#include "src/time/clock.h"
#include <stddef.h>
@@ -92,9 +93,6 @@ class Benchmark {
private:
virtual void run() = 0;
virtual const cpp::string_view get_name() const = 0;
-
- static Benchmark *start;
- static Benchmark *end;
};
class WrapperBenchmark : public Benchmark {
>From a588fc5b2eac6e84dd0dc4f62bebfc428a695845 Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Wed, 22 May 2024 16:27:44 -0400
Subject: [PATCH 06/16] refactor cmake files
---
libc/benchmarks/CMakeLists.txt | 188 +++++++++----------
libc/benchmarks/gpu/CMakeLists.txt | 3 +-
libc/benchmarks/gpu/src/ctype/CMakeLists.txt | 28 +--
libc/cmake/modules/LLVMLibCTestRules.cmake | 27 +--
4 files changed, 122 insertions(+), 124 deletions(-)
diff --git a/libc/benchmarks/CMakeLists.txt b/libc/benchmarks/CMakeLists.txt
index 221d4e11d383d..0234ccb2a7a78 100644
--- a/libc/benchmarks/CMakeLists.txt
+++ b/libc/benchmarks/CMakeLists.txt
@@ -8,26 +8,24 @@ find_package(Threads)
set(LLVM_LINK_COMPONENTS
Support
TargetParser
-)
+ )
#==============================================================================
# Add Unit Testing Support
#==============================================================================
function(add_libc_benchmark_unittest target_name)
- if(NOT LLVM_INCLUDE_TESTS)
+ if(NOT LLVM_INCLUDE_TESTS)
return()
- endif()
+ endif()
- cmake_parse_arguments(if(LIBC_TARGET_OS_IS_GPU)
- add_subdirectory(gpu)
- return()
+ cmake_parse_arguments(
"LIBC_BENCHMARKS_UNITTEST"
"" # No optional arguments
"SUITE" # Single value arguments
"SRCS;DEPENDS" # Multi-value arguments
${ARGN}
- )
+ )
add_executable(${target_name}
EXCLUDE_FROM_ALL
@@ -55,99 +53,99 @@ endfunction()
include(ExternalProject)
ExternalProject_Add(google-benchmark-libc
- EXCLUDE_FROM_ALL ON
- PREFIX google-benchmark-libc
- SOURCE_DIR ${LLVM_THIRD_PARTY_DIR}/benchmark
- INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/google-benchmark-libc
- CMAKE_CACHE_ARGS
- -DBENCHMARK_ENABLE_EXCEPTIONS:BOOL=OFF
- -DBENCHMARK_ENABLE_LTO:BOOL=OFF
- -DBENCHMARK_ENABLE_TESTING:BOOL=OFF
- -DBENCHMARK_ENABLE_WERROR:BOOL=${LLVM_ENABLE_WERROR}
- -DBENCHMARK_FORCE_WERROR:BOOL=OFF
- -DBENCHMARK_USE_LIBCXX:BOOL=OFF
- -DCMAKE_BUILD_TYPE:STRING=Release
-
- -DCMAKE_SYSTEM_NAME:STRING=${CMAKE_SYSTEM_NAME}
- -DCMAKE_SYSTEM_PROCESSOR:STRING=${CMAKE_SYSTEM_PROCESSOR}
- -DCMAKE_C_COMPILER:STRING=${CMAKE_C_COMPILER}
- -DCMAKE_CXX_COMPILER:STRING=${CMAKE_CXX_COMPILER}
- -DCMAKE_CXX_FLAGS:STRING=${CMAKE_CXX_FLAGS}
- -DCMAKE_FIND_ROOT_PATH:STRING=${CMAKE_FIND_ROOT_PATH}
-
- -DBUILD_SHARED_LIBS:BOOL=OFF
- -DCMAKE_EXE_LINKER_FLAGS:STRING=-static
-
- -DCMAKE_CXX_STANDARD:STRING=14
- -DCMAKE_INSTALL_PREFIX:PATH=<INSTALL_DIR>
- )
+ EXCLUDE_FROM_ALL ON
+ PREFIX google-benchmark-libc
+ SOURCE_DIR ${LLVM_THIRD_PARTY_DIR}/benchmark
+ INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/google-benchmark-libc
+ CMAKE_CACHE_ARGS
+ -DBENCHMARK_ENABLE_EXCEPTIONS:BOOL=OFF
+ -DBENCHMARK_ENABLE_LTO:BOOL=OFF
+ -DBENCHMARK_ENABLE_TESTING:BOOL=OFF
+ -DBENCHMARK_ENABLE_WERROR:BOOL=${LLVM_ENABLE_WERROR}
+ -DBENCHMARK_FORCE_WERROR:BOOL=OFF
+ -DBENCHMARK_USE_LIBCXX:BOOL=OFF
+ -DCMAKE_BUILD_TYPE:STRING=Release
+
+ -DCMAKE_SYSTEM_NAME:STRING=${CMAKE_SYSTEM_NAME}
+ -DCMAKE_SYSTEM_PROCESSOR:STRING=${CMAKE_SYSTEM_PROCESSOR}
+ -DCMAKE_C_COMPILER:STRING=${CMAKE_C_COMPILER}
+ -DCMAKE_CXX_COMPILER:STRING=${CMAKE_CXX_COMPILER}
+ -DCMAKE_CXX_FLAGS:STRING=${CMAKE_CXX_FLAGS}
+ -DCMAKE_FIND_ROOT_PATH:STRING=${CMAKE_FIND_ROOT_PATH}
+
+ -DBUILD_SHARED_LIBS:BOOL=OFF
+ -DCMAKE_EXE_LINKER_FLAGS:STRING=-static
+
+ -DCMAKE_CXX_STANDARD:STRING=14
+ -DCMAKE_INSTALL_PREFIX:PATH=<INSTALL_DIR>
+ )
add_custom_target(libc-benchmark-util-tests)
# libc-benchmark
add_library(libc-benchmark
- STATIC
- EXCLUDE_FROM_ALL
- LibcBenchmark.cpp
- LibcBenchmark.h
+ STATIC
+ EXCLUDE_FROM_ALL
+ LibcBenchmark.cpp
+ LibcBenchmark.h
)
target_include_directories(libc-benchmark
- PUBLIC ${LLVM_INCLUDE_DIR} ${LLVM_MAIN_INCLUDE_DIR}
+ PUBLIC ${LLVM_INCLUDE_DIR} ${LLVM_MAIN_INCLUDE_DIR}
)
target_link_libraries(libc-benchmark
- PUBLIC
- benchmark::benchmark
- LLVMSupport
- LLVMTargetParser
- Threads::Threads
+ PUBLIC
+ benchmark::benchmark
+ LLVMSupport
+ LLVMTargetParser
+ Threads::Threads
)
add_dependencies(libc-benchmark google-benchmark-libc)
llvm_update_compile_flags(libc-benchmark)
add_libc_benchmark_unittest(libc-benchmark-test
- SRCS LibcBenchmarkTest.cpp
- DEPENDS libc-benchmark
+ SRCS LibcBenchmarkTest.cpp
+ DEPENDS libc-benchmark
)
# libc-memory-benchmark
add_library(libc-memory-benchmark
- STATIC
- EXCLUDE_FROM_ALL
- LibcMemoryBenchmark.cpp
- LibcMemoryBenchmark.h
- LibcFunctionPrototypes.h
- MemorySizeDistributions.cpp
- MemorySizeDistributions.h
+ STATIC
+ EXCLUDE_FROM_ALL
+ LibcMemoryBenchmark.cpp
+ LibcMemoryBenchmark.h
+ LibcFunctionPrototypes.h
+ MemorySizeDistributions.cpp
+ MemorySizeDistributions.h
)
target_include_directories(libc-memory-benchmark
- PUBLIC
- ${CMAKE_CURRENT_SOURCE_DIR}
+ PUBLIC
+ ${CMAKE_CURRENT_SOURCE_DIR}
)
target_link_libraries(libc-memory-benchmark
- PUBLIC
- libc-benchmark
+ PUBLIC
+ libc-benchmark
)
llvm_update_compile_flags(libc-memory-benchmark)
add_libc_benchmark_unittest(libc-memory-benchmark-test
- SRCS LibcMemoryBenchmarkTest.cpp
- DEPENDS libc-memory-benchmark
+ SRCS LibcMemoryBenchmarkTest.cpp
+ DEPENDS libc-memory-benchmark
)
# json
add_library(json
- STATIC
- EXCLUDE_FROM_ALL
- JSON.cpp
- JSON.h
+ STATIC
+ EXCLUDE_FROM_ALL
+ JSON.cpp
+ JSON.h
)
target_link_libraries(json PUBLIC libc-memory-benchmark)
llvm_update_compile_flags(json)
add_libc_benchmark_unittest(json-test
- SRCS JSONTest.cpp
- DEPENDS json
+ SRCS JSONTest.cpp
+ DEPENDS json
)
#==============================================================================
@@ -156,25 +154,25 @@ add_libc_benchmark_unittest(json-test
# Benchmark all implementations that can run on the target CPU.
function(add_libc_multi_impl_benchmark name)
- get_property(fq_implementations GLOBAL PROPERTY ${name}_implementations)
- foreach(fq_config_name IN LISTS fq_implementations)
- get_target_property(required_cpu_features ${fq_config_name} REQUIRE_CPU_FEATURES)
- cpu_supports(can_run "${required_cpu_features}")
- if(can_run)
- set(benchmark_name ${fq_config_name}_benchmark)
- add_executable(${benchmark_name}
- EXCLUDE_FROM_ALL
- LibcMemoryBenchmarkMain.cpp
- )
- get_target_property(entrypoint_object_file ${fq_config_name} "OBJECT_FILE_RAW")
- target_link_libraries(${benchmark_name} PUBLIC json ${entrypoint_object_file})
- string(TOUPPER ${name} name_upper)
- target_compile_definitions(${benchmark_name} PRIVATE "-DLIBC_BENCHMARK_FUNCTION_${name_upper}=LIBC_NAMESPACE::${name}" "-DLIBC_BENCHMARK_FUNCTION_NAME=\"${fq_config_name}\"")
- llvm_update_compile_flags(${benchmark_name})
- else()
- message(STATUS "Skipping benchmark for '${fq_config_name}' insufficient host cpu features '${required_cpu_features}'")
- endif()
- endforeach()
+ get_property(fq_implementations GLOBAL PROPERTY ${name}_implementations)
+ foreach(fq_config_name IN LISTS fq_implementations)
+ get_target_property(required_cpu_features ${fq_config_name} REQUIRE_CPU_FEATURES)
+ cpu_supports(can_run "${required_cpu_features}")
+ if(can_run)
+ set(benchmark_name ${fq_config_name}_benchmark)
+ add_executable(${benchmark_name}
+ EXCLUDE_FROM_ALL
+ LibcMemoryBenchmarkMain.cpp
+ )
+ get_target_property(entrypoint_object_file ${fq_config_name} "OBJECT_FILE_RAW")
+ target_link_libraries(${benchmark_name} PUBLIC json ${entrypoint_object_file})
+ string(TOUPPER ${name} name_upper)
+ target_compile_definitions(${benchmark_name} PRIVATE "-DLIBC_BENCHMARK_FUNCTION_${name_upper}=LIBC_NAMESPACE::${name}" "-DLIBC_BENCHMARK_FUNCTION_NAME=\"${fq_config_name}\"")
+ llvm_update_compile_flags(${benchmark_name})
+ else()
+ message(STATUS "Skipping benchmark for '${fq_config_name}' insufficient host cpu features '${required_cpu_features}'")
+ endif()
+ endforeach()
endfunction()
add_libc_multi_impl_benchmark(bcmp)
@@ -192,20 +190,20 @@ add_libc_multi_impl_benchmark(memset)
# libc memory functions compiled for the host machine. This is useful to
# continuously monitor the performance of the memory functions.
add_executable(libc.benchmarks.memory_functions.opt_host
- EXCLUDE_FROM_ALL
- LibcMemoryGoogleBenchmarkMain.cpp
- LibcDefaultImplementations.cpp
+ EXCLUDE_FROM_ALL
+ LibcMemoryGoogleBenchmarkMain.cpp
+ LibcDefaultImplementations.cpp
)
target_link_libraries(libc.benchmarks.memory_functions.opt_host
- PRIVATE
- libc-memory-benchmark
- libc.src.string.memcmp_opt_host.__internal__
- libc.src.string.bcmp_opt_host.__internal__
- libc.src.string.memcpy_opt_host.__internal__
- libc.src.string.memset_opt_host.__internal__
- libc.src.string.bzero_opt_host.__internal__
- libc.src.string.memmove_opt_host.__internal__
- benchmark_main
+ PRIVATE
+ libc-memory-benchmark
+ libc.src.string.memcmp_opt_host.__internal__
+ libc.src.string.bcmp_opt_host.__internal__
+ libc.src.string.memcpy_opt_host.__internal__
+ libc.src.string.memset_opt_host.__internal__
+ libc.src.string.bzero_opt_host.__internal__
+ libc.src.string.memmove_opt_host.__internal__
+ benchmark_main
)
llvm_update_compile_flags(libc.benchmarks.memory_functions.opt_host)
diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
index 51fc267df807d..9ed45eedc402e 100644
--- a/libc/benchmarks/gpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/CMakeLists.txt
@@ -13,9 +13,8 @@ function(add_benchmark benchmark_name)
if(NOT libc.src.time.clock IN_LIST TARGET_LLVMLIBC_ENTRYPOINTS)
message(FATAL_ERROR "target does not support clock")
endif()
- add_libc_hermetic_test(
+ add_libc_hermetic(
${benchmark_name}
- IS_BENCHMARK
LINK_LIBRARIES
LibcGpuBenchmark.hermetic
${BENCHMARK_LINK_LIBRARIES}
diff --git a/libc/benchmarks/gpu/src/ctype/CMakeLists.txt b/libc/benchmarks/gpu/src/ctype/CMakeLists.txt
index 8d448b8ced955..79f01425770da 100644
--- a/libc/benchmarks/gpu/src/ctype/CMakeLists.txt
+++ b/libc/benchmarks/gpu/src/ctype/CMakeLists.txt
@@ -1,21 +1,21 @@
add_custom_target(libc-gpu-ctype-benchmarks)
add_benchmark(
- isalnum_benchmark
- SUITE
- libc-gpu-ctype-benchmarks
- SRCS
- isalnum_benchmark.cpp
- DEPENDS
- libc.src.ctype.isalnum
+ isalnum_benchmark
+ SUITE
+ libc-gpu-ctype-benchmarks
+ SRCS
+ isalnum_benchmark.cpp
+ DEPENDS
+ libc.src.ctype.isalnum
)
add_benchmark(
- isalpha_benchmark
- SUITE
- libc-gpu-ctype-benchmarks
- SRCS
- isalpha_benchmark.cpp
- DEPENDS
- libc.src.ctype.isalpha
+ isalpha_benchmark
+ SUITE
+ libc-gpu-ctype-benchmarks
+ SRCS
+ isalpha_benchmark.cpp
+ DEPENDS
+ libc.src.ctype.isalpha
)
diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake
index 278137774e089..508694ae9fc01 100644
--- a/libc/cmake/modules/LLVMLibCTestRules.cmake
+++ b/libc/cmake/modules/LLVMLibCTestRules.cmake
@@ -526,12 +526,15 @@ function(add_integration_test test_name)
add_dependencies(${INTEGRATION_TEST_SUITE} ${fq_target_name})
endfunction(add_integration_test)
-# Rule to add a hermetic test. A hermetic test is one whose executable is fully
+# Rule to add a hermetic program. A hermetic program is one whose executable is fully
# statically linked and consists of pieces drawn only from LLVM's libc. Nothing,
# including the startup objects, come from the system libc.
#
+# For the GPU, these can be either tests or benchmarks, depending on the value
+# of the LINK_LIBRARIES arg.
+#
# Usage:
-# add_libc_hermetic_test(
+# add_libc_hermetic(
# <target name>
# SUITE <the suite to which the test should belong>
# SRCS <src1.cpp> [src2.cpp ...]
@@ -543,14 +546,14 @@ endfunction(add_integration_test)
# LINK_LIBRARIES <list of linking libraries for this target>
# LOADER_ARGS <list of special args to loaders (like the GPU loader)>
# )
-function(add_libc_hermetic_test test_name)
+function(add_libc_hermetic test_name)
if(NOT TARGET libc.startup.${LIBC_TARGET_OS}.crt1)
message(VERBOSE "Skipping ${fq_target_name} as it is not available on ${LIBC_TARGET_OS}.")
return()
endif()
cmake_parse_arguments(
"HERMETIC_TEST"
- "IS_BENCHMARK" # Optional arguments
+ "" # No optional arguments
"SUITE" # Single value arguments
"SRCS;HDRS;DEPENDS;ARGS;ENV;COMPILE_OPTIONS;LINK_LIBRARIES;LOADER_ARGS" # Multi-value arguments
${ARGN}
@@ -651,13 +654,6 @@ function(add_libc_hermetic_test test_name)
endif()
endforeach()
- # Benchmarks requires a separate library with a different `main` function
- if(HERMETIC_TEST_IS_BENCHMARK)
- list(APPEND link_libraries LibcGpuBenchmark.hermetic)
- else()
- list(APPEND link_libraries LibcTest.hermetic)
- endif()
-
if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
target_link_options(${fq_build_target_name} PRIVATE
${LIBC_COMPILE_OPTIONS_DEFAULT}
@@ -721,7 +717,7 @@ function(add_libc_hermetic_test test_name)
add_dependencies(${HERMETIC_TEST_SUITE} ${fq_target_name})
add_dependencies(libc-hermetic-tests ${fq_target_name})
-endfunction(add_libc_hermetic_test)
+endfunction(add_libc_hermetic)
# A convenience function to add both a unit test as well as a hermetic test.
function(add_libc_test test_name)
@@ -736,7 +732,12 @@ function(add_libc_test test_name)
add_libc_unittest(${test_name}.__unit__ ${LIBC_TEST_UNPARSED_ARGUMENTS})
endif()
if(LIBC_ENABLE_HERMETIC_TESTS AND NOT LIBC_TEST_UNIT_TEST_ONLY)
- add_libc_hermetic_test(${test_name}.__hermetic__ ${LIBC_TEST_UNPARSED_ARGUMENTS})
+ add_libc_hermetic(
+ ${test_name}.__hermetic__
+ LINK_LIBRARIES
+ LibcTest.hermetic
+ ${LIBC_TEST_UNPARSED_ARGUMENTS}
+ )
get_fq_target_name(${test_name} fq_test_name)
if(TARGET ${fq_test_name}.__hermetic__ AND TARGET ${fq_test_name}.__unit__)
# Tests like the file tests perform file operations on disk file. If we
>From be303da366eb2e7dd42ad12c206268b4259264c3 Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Thu, 23 May 2024 12:34:21 -0400
Subject: [PATCH 07/16] rename namespace and refactor casts
---
libc/benchmarks/gpu/BenchmarkLogger.cpp | 4 +-
libc/benchmarks/gpu/BenchmarkLogger.h | 4 +-
libc/benchmarks/gpu/LibcGpuBenchmark.cpp | 31 +++++------
libc/benchmarks/gpu/LibcGpuBenchmark.h | 54 ++++++++------------
libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp | 2 +-
5 files changed, 42 insertions(+), 53 deletions(-)
diff --git a/libc/benchmarks/gpu/BenchmarkLogger.cpp b/libc/benchmarks/gpu/BenchmarkLogger.cpp
index 4f70d23a1e95e..9a36ee5b3046c 100644
--- a/libc/benchmarks/gpu/BenchmarkLogger.cpp
+++ b/libc/benchmarks/gpu/BenchmarkLogger.cpp
@@ -9,7 +9,7 @@
#include <stdint.h>
namespace LIBC_NAMESPACE {
-namespace libc_gpu_benchmarks {
+namespace benchmarks {
// cpp::string_view specialization
template <>
@@ -93,5 +93,5 @@ template BenchmarkLogger &BenchmarkLogger::operator<< <UInt<320>>(UInt<320>);
BenchmarkLogger blog;
-} // namespace libc_gpu_benchmarks
+} // namespace benchmarks
} // namespace LIBC_NAMESPACE
diff --git a/libc/benchmarks/gpu/BenchmarkLogger.h b/libc/benchmarks/gpu/BenchmarkLogger.h
index ed3cc97e59c6d..98813b28eaa91 100644
--- a/libc/benchmarks/gpu/BenchmarkLogger.h
+++ b/libc/benchmarks/gpu/BenchmarkLogger.h
@@ -10,7 +10,7 @@
#define LLVM_LIBC_BENCHMARKS_GPU_BENCHMARKLOGGER_H
namespace LIBC_NAMESPACE {
-namespace libc_gpu_benchmarks {
+namespace benchmarks {
// A class to log to standard output in the context of hermetic tests.
struct BenchmarkLogger {
@@ -21,7 +21,7 @@ struct BenchmarkLogger {
// A global TestLogger instance to be used in tests.
extern BenchmarkLogger blog;
-} // namespace libc_gpu_benchmarks
+} // namespace benchmarks
} // namespace LIBC_NAMESPACE
#endif /* LLVM_LIBC_BENCHMARKS_GPU_BENCHMARKLOGGER_H */
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index f8021c873242f..4c49839249d56 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -4,7 +4,7 @@
#include "src/time/gpu/time_utils.h"
namespace LIBC_NAMESPACE {
-namespace libc_gpu_benchmarks {
+namespace benchmarks {
FixedVector<Benchmark *, 64> benchmarks_to_run;
@@ -23,22 +23,22 @@ BenchmarkResult benchmark(const BenchmarkOptions &options,
cpp::function<uint64_t(void)> wrapper_func) {
BenchmarkResult result;
RuntimeEstimationProgression rep;
- size_t total_iterations = 0;
- size_t iterations = options.initial_iterations;
- if (iterations < (uint32_t)1)
+ uint32_t total_iterations = 0;
+ uint32_t iterations = options.initial_iterations;
+ if (iterations < 1u)
iterations = 1;
- size_t samples = 0;
+ uint32_t samples = 0;
uint64_t total_time = 0;
uint64_t best_guess = 0;
uint64_t total_cycles = 0;
- uint64_t cycles_2 = 0;
- uint64_t min = UINT_MAX;
+ uint64_t cycles_squared = 0;
+ uint64_t min = UINT64_MAX;
uint64_t max = 0;
- for (;;) {
+ for (uint64_t time_budget = options.max_duration; time_budget >= 0;) {
uint64_t sample_cycles = 0;
uint64_t overhead = LIBC_NAMESPACE::overhead();
- const clock_t start = (double)clock();
+ const clock_t start = static_cast<double>(clock());
for (uint32_t i = 0; i < iterations; i++) {
auto wrapper_intermediate = wrapper_func();
uint64_t result = wrapper_intermediate - overhead;
@@ -50,9 +50,10 @@ BenchmarkResult benchmark(const BenchmarkOptions &options,
const clock_t duration_ns =
((end - start) * 1000 * 1000 * 1000) / CLOCKS_PER_SEC;
total_time += duration_ns;
+ time_budget -= duration_ns;
samples++;
total_cycles += sample_cycles;
- cycles_2 += sample_cycles * sample_cycles;
+ cycles_squared += sample_cycles * sample_cycles;
total_iterations += iterations;
const double change_ratio =
@@ -60,8 +61,7 @@ BenchmarkResult benchmark(const BenchmarkOptions &options,
best_guess = rep.current_estimation;
if (samples >= options.max_samples ||
- iterations >= options.max_iterations ||
- total_time >= options.max_duration) {
+ iterations >= options.max_iterations) {
break;
} else if (total_time >= options.min_duration &&
samples >= options.min_samples &&
@@ -72,8 +72,9 @@ BenchmarkResult benchmark(const BenchmarkOptions &options,
iterations *= options.scaling_factor;
}
result.cycles = best_guess;
- result.standard_deviation = fputil::sqrt((double)cycles_2 / total_iterations -
- (best_guess * best_guess));
+ result.standard_deviation =
+ fputil::sqrt(static_cast<double>(cycles_squared) / total_iterations -
+ (best_guess * best_guess));
result.min = min;
result.max = max;
result.samples = samples;
@@ -82,5 +83,5 @@ BenchmarkResult benchmark(const BenchmarkOptions &options,
return result;
};
-} // namespace libc_gpu_benchmarks
+} // namespace benchmarks
} // namespace LIBC_NAMESPACE
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index 459e4d9b6ea98..20543af66e331 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -9,12 +9,11 @@
#include "src/__support/fixedvector.h"
#include "src/time/clock.h"
-#include <stddef.h>
#include <stdint.h>
namespace LIBC_NAMESPACE {
-namespace libc_gpu_benchmarks {
+namespace benchmarks {
struct BenchmarkOptions {
uint32_t initial_iterations = 1;
@@ -28,13 +27,13 @@ struct BenchmarkOptions {
};
struct Measurement {
- size_t iterations = 0;
+ uint32_t iterations = 0;
uint64_t elapsed_cycles = 0;
};
class RefinableRuntimeEstimation {
uint64_t total_cycles = 0;
- size_t total_iterations = 0;
+ uint32_t total_iterations = 0;
public:
uint64_t update(const Measurement &M) {
@@ -53,7 +52,8 @@ class RuntimeEstimationProgression {
double compute_improvement(const Measurement &M) {
const uint64_t new_estimation = rre.update(M);
- double ratio = ((double)current_estimation / new_estimation) - 1.0;
+ double ratio =
+ (static_cast<double>(current_estimation) / new_estimation) - 1.0;
// Get absolute value
if (ratio < 0)
@@ -67,10 +67,10 @@ class RuntimeEstimationProgression {
struct BenchmarkResult {
uint64_t cycles = 0;
double standard_deviation = 0;
- uint64_t min = UINT_MAX;
+ uint64_t min = UINT64_MAX;
uint64_t max = 0;
- size_t samples = 0;
- size_t total_iterations = 0;
+ uint32_t samples = 0;
+ uint32_t total_iterations = 0;
clock_t total_time = 0;
};
@@ -78,35 +78,22 @@ BenchmarkResult benchmark(const BenchmarkOptions &options,
cpp::function<uint64_t(void)> wrapper_func);
class Benchmark {
- Benchmark *next = nullptr;
-
-public:
- virtual ~Benchmark() {}
- virtual void set_up() {}
- virtual void tear_down() {}
-
- static int run_benchmarks();
-
-protected:
- static void add_benchmark(Benchmark *);
-
-private:
- virtual void run() = 0;
- virtual const cpp::string_view get_name() const = 0;
-};
-
-class WrapperBenchmark : public Benchmark {
const cpp::function<uint64_t(void)> func;
const cpp::string_view name;
public:
- WrapperBenchmark(cpp::function<uint64_t(void)> func, char const *name)
+ Benchmark(cpp::function<uint64_t(void)> func, char const *name)
: func(func), name(name) {
add_benchmark(this);
}
+ static int run_benchmarks();
+
+protected:
+ static void add_benchmark(Benchmark *benchmark);
+
private:
- void run() override {
+ void run() {
BenchmarkOptions options;
auto result = benchmark(options, func);
constexpr auto GREEN = "\033[32m";
@@ -115,15 +102,16 @@ class WrapperBenchmark : public Benchmark {
blog << GREEN << "[ OK ] " << RESET << name << ": " << result.cycles
<< " cycles, " << result.min << " min, " << result.max << " max, "
<< result.total_iterations << " iterations, " << result.total_time
- << " ns, " << (long)result.standard_deviation << " stddev\n";
+ << " ns, " << static_cast<long>(result.standard_deviation)
+ << " stddev\n";
}
- const cpp::string_view get_name() const override { return name; }
+ const cpp::string_view get_name() const { return name; }
};
-} // namespace libc_gpu_benchmarks
+} // namespace benchmarks
} // namespace LIBC_NAMESPACE
#define BENCHMARK(SuiteName, TestName, Func) \
- LIBC_NAMESPACE::libc_gpu_benchmarks::WrapperBenchmark \
- SuiteName##_##TestName##_Instance(Func, #SuiteName "." #TestName);
+ LIBC_NAMESPACE::benchmarks::Benchmark SuiteName##_##TestName##_Instance( \
+ Func, #SuiteName "." #TestName);
#endif
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp b/libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp
index 510fd13210494..97366e55194a9 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp
@@ -1,6 +1,6 @@
#include "LibcGpuBenchmark.h"
extern "C" int main(int argc, char **argv, char **envp) {
- LIBC_NAMESPACE::libc_gpu_benchmarks::Benchmark::run_benchmarks();
+ LIBC_NAMESPACE::benchmarks::Benchmark::run_benchmarks();
return 0;
}
>From a41eb326ad5d4d99ba24ab17c5ae55ce022b43af Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Mon, 27 May 2024 11:32:30 -0400
Subject: [PATCH 08/16] repeat overhead measurement outside of loop
---
libc/benchmarks/gpu/LibcGpuBenchmark.cpp | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index 4c49839249d56..e91d2b400444a 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -35,9 +35,15 @@ BenchmarkResult benchmark(const BenchmarkOptions &options,
uint64_t cycles_squared = 0;
uint64_t min = UINT64_MAX;
uint64_t max = 0;
+
+ uint64_t total_overhead_cycles = 0;
+ uint32_t overhead_iterations = 10;
+ for (int i = 0; i < overhead_iterations; i++)
+ total_overhead_cycles += LIBC_NAMESPACE::overhead();
+ uint64_t overhead = total_overhead_cycles / overhead_iterations;
+
for (uint64_t time_budget = options.max_duration; time_budget >= 0;) {
uint64_t sample_cycles = 0;
- uint64_t overhead = LIBC_NAMESPACE::overhead();
const clock_t start = static_cast<double>(clock());
for (uint32_t i = 0; i < iterations; i++) {
auto wrapper_intermediate = wrapper_func();
>From ab6b6cae819bdd2ba7da32292063a7bcd6620e10 Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Mon, 27 May 2024 11:37:14 -0400
Subject: [PATCH 09/16] switch to using min measurement for overhead
---
libc/benchmarks/gpu/LibcGpuBenchmark.cpp | 7 +++----
1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index e91d2b400444a..f0ba3af23a140 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -36,11 +36,10 @@ BenchmarkResult benchmark(const BenchmarkOptions &options,
uint64_t min = UINT64_MAX;
uint64_t max = 0;
- uint64_t total_overhead_cycles = 0;
- uint32_t overhead_iterations = 10;
+ uint64_t overhead = UINT64_MAX;
+ int overhead_iterations = 10;
for (int i = 0; i < overhead_iterations; i++)
- total_overhead_cycles += LIBC_NAMESPACE::overhead();
- uint64_t overhead = total_overhead_cycles / overhead_iterations;
+ overhead = cpp::min(overhead, LIBC_NAMESPACE::overhead());
for (uint64_t time_budget = options.max_duration; time_budget >= 0;) {
uint64_t sample_cycles = 0;
>From c7c8445f76fef4923d0607208a621bcd7a8ef58d Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Tue, 28 May 2024 22:46:04 -0400
Subject: [PATCH 10/16] fix style
---
libc/benchmarks/gpu/LibcGpuBenchmark.cpp | 19 +++++++------------
libc/benchmarks/gpu/LibcGpuBenchmark.h | 2 +-
2 files changed, 8 insertions(+), 13 deletions(-)
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index f0ba3af23a140..e4f839e361dd0 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -6,17 +6,15 @@
namespace LIBC_NAMESPACE {
namespace benchmarks {
-FixedVector<Benchmark *, 64> benchmarks_to_run;
+FixedVector<Benchmark *, 64> benchmarks;
void Benchmark::add_benchmark(Benchmark *benchmark) {
- benchmarks_to_run.push_back(benchmark);
+ benchmarks.push_back(benchmark);
}
-int Benchmark::run_benchmarks() {
- for (auto it = benchmarks_to_run.rbegin(), e = benchmarks_to_run.rend();
- it != e; ++it)
+void Benchmark::run_benchmarks() {
+ for (auto it = benchmarks.rbegin(), e = benchmarks.rend(); it != e; ++it)
(*it)->run();
- return 0;
}
BenchmarkResult benchmark(const BenchmarkOptions &options,
@@ -65,14 +63,11 @@ BenchmarkResult benchmark(const BenchmarkOptions &options,
rep.compute_improvement({iterations, sample_cycles});
best_guess = rep.current_estimation;
- if (samples >= options.max_samples ||
- iterations >= options.max_iterations) {
+ if (samples >= options.max_samples || iterations >= options.max_iterations)
break;
- } else if (total_time >= options.min_duration &&
- samples >= options.min_samples &&
- change_ratio < options.epsilon) {
+ if (total_time >= options.min_duration && samples >= options.min_samples &&
+ change_ratio < options.epsilon)
break;
- }
iterations *= options.scaling_factor;
}
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index 20543af66e331..08e99dadc8d07 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -87,7 +87,7 @@ class Benchmark {
add_benchmark(this);
}
- static int run_benchmarks();
+ static void run_benchmarks();
protected:
static void add_benchmark(Benchmark *benchmark);
>From c857891c2f30ffba251fcce6be2d647d39a2bf69 Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Tue, 28 May 2024 23:07:26 -0400
Subject: [PATCH 11/16] unconditionally add benchmarks in gpu build
---
libc/CMakeLists.txt | 4 +---
libc/benchmarks/CMakeLists.txt | 5 +++++
2 files changed, 6 insertions(+), 3 deletions(-)
diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt
index f35471a06a53e..4ffcd55ba9500 100644
--- a/libc/CMakeLists.txt
+++ b/libc/CMakeLists.txt
@@ -401,9 +401,7 @@ if(LLVM_INCLUDE_TESTS)
add_subdirectory(fuzzing)
endif()
-if(LIBC_INCLUDE_BENCHMARKS)
- add_subdirectory(benchmarks)
-endif()
+add_subdirectory(benchmarks)
if (LIBC_INCLUDE_DOCS)
add_subdirectory(docs)
diff --git a/libc/benchmarks/CMakeLists.txt b/libc/benchmarks/CMakeLists.txt
index 0234ccb2a7a78..0cff6eb12c247 100644
--- a/libc/benchmarks/CMakeLists.txt
+++ b/libc/benchmarks/CMakeLists.txt
@@ -3,6 +3,11 @@ if(LIBC_TARGET_OS_IS_GPU)
return()
endif()
+# The CPU build depends on Google benchmark.
+if(NOT LIBC_INCLUDE_BENCHMARKS)
+ return()
+endif()
+
find_package(Threads)
set(LLVM_LINK_COMPONENTS
>From 6073de7b30620397831ba76b5d587b88e035c14e Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Wed, 29 May 2024 22:30:00 -0400
Subject: [PATCH 12/16] add forward iterator
---
libc/benchmarks/gpu/LibcGpuBenchmark.cpp | 4 +-
libc/src/__support/CPP/array.h | 26 +++++----
libc/src/__support/CPP/iterator.h | 68 ++++++++++++++++++++++++
libc/src/__support/fixedvector.h | 4 ++
4 files changed, 90 insertions(+), 12 deletions(-)
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index e4f839e361dd0..a7a02cacc3305 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -13,8 +13,8 @@ void Benchmark::add_benchmark(Benchmark *benchmark) {
}
void Benchmark::run_benchmarks() {
- for (auto it = benchmarks.rbegin(), e = benchmarks.rend(); it != e; ++it)
- (*it)->run();
+ for (Benchmark *benchmark : benchmarks)
+ benchmark->run();
}
BenchmarkResult benchmark(const BenchmarkOptions &options,
diff --git a/libc/src/__support/CPP/array.h b/libc/src/__support/CPP/array.h
index 4e69ba003e800..7e4cf29847daf 100644
--- a/libc/src/__support/CPP/array.h
+++ b/libc/src/__support/CPP/array.h
@@ -22,10 +22,12 @@ template <class T, size_t N> struct array {
T Data[N];
using value_type = T;
- using iterator = T *;
- using const_iterator = const T *;
- using reverse_iterator = cpp::reverse_iterator<iterator>;
- using const_reverse_iterator = cpp::reverse_iterator<const_iterator>;
+ using pointer_type = T *;
+ using iterator = cpp::iterator<pointer_type>;
+ using const_pointer_type = const T *;
+ using const_iterator = cpp::iterator<const_pointer_type>;
+ using reverse_iterator = cpp::reverse_iterator<pointer_type>;
+ using const_reverse_iterator = cpp::reverse_iterator<const_pointer_type>;
LIBC_INLINE constexpr T *data() { return Data; }
LIBC_INLINE constexpr const T *data() const { return Data; }
@@ -46,12 +48,16 @@ template <class T, size_t N> struct array {
LIBC_INLINE constexpr bool empty() const { return N == 0; }
- LIBC_INLINE constexpr iterator begin() { return Data; }
- LIBC_INLINE constexpr const_iterator begin() const { return Data; }
+ LIBC_INLINE constexpr iterator begin() { return iterator{Data}; }
+ LIBC_INLINE constexpr const_iterator begin() const {
+ return const_iterator{Data};
+ }
LIBC_INLINE constexpr const_iterator cbegin() const { return begin(); }
- LIBC_INLINE constexpr iterator end() { return Data + N; }
- LIBC_INLINE constexpr const_iterator end() const { return Data + N; }
+ LIBC_INLINE constexpr iterator end() { return iterator{Data + N}; }
+ LIBC_INLINE constexpr const_iterator end() const {
+ return const_iterator{Data + N};
+ }
LIBC_INLINE constexpr const_iterator cend() const { return end(); }
LIBC_INLINE constexpr reverse_iterator rbegin() {
@@ -65,10 +71,10 @@ template <class T, size_t N> struct array {
}
LIBC_INLINE constexpr reverse_iterator rend() {
- return reverse_iterator{begin()};
+ return reverse_iterator{Data};
}
LIBC_INLINE constexpr const_reverse_iterator rend() const {
- return const_reverse_iterator{begin()};
+ return const_reverse_iterator{Data};
}
LIBC_INLINE constexpr const_reverse_iterator crend() const { return rend(); }
};
diff --git a/libc/src/__support/CPP/iterator.h b/libc/src/__support/CPP/iterator.h
index b0fd5c9f22ae0..37d631b01582e 100644
--- a/libc/src/__support/CPP/iterator.h
+++ b/libc/src/__support/CPP/iterator.h
@@ -92,6 +92,74 @@ template <typename Iter> class reverse_iterator {
}
};
+template <typename Iter> class iterator {
+ Iter current;
+
+public:
+ using reference = typename iterator_traits<Iter>::reference;
+ using value_type = typename iterator_traits<Iter>::value_type;
+ using iterator_type = Iter;
+
+ LIBC_INLINE iterator() : current() {}
+ LIBC_INLINE constexpr explicit iterator(Iter it) : current(it) {}
+
+ template <typename Other,
+ cpp::enable_if_t<!cpp::is_same_v<Iter, Other> &&
+ cpp::is_convertible_v<const Other &, Iter>,
+ int> = 0>
+ LIBC_INLINE constexpr explicit iterator(const Other &it) : current(it) {}
+
+ LIBC_INLINE friend constexpr bool operator==(const iterator &lhs,
+ const iterator &rhs) {
+ return lhs.base() == rhs.base();
+ }
+
+ LIBC_INLINE friend constexpr bool operator!=(const iterator &lhs,
+ const iterator &rhs) {
+ return lhs.base() != rhs.base();
+ }
+
+ LIBC_INLINE friend constexpr bool operator<(const iterator &lhs,
+ const iterator &rhs) {
+ return lhs.base() < rhs.base();
+ }
+
+ LIBC_INLINE friend constexpr bool operator<=(const iterator &lhs,
+ const iterator &rhs) {
+ return lhs.base() <= rhs.base();
+ }
+
+ LIBC_INLINE friend constexpr bool operator>(const iterator &lhs,
+ const iterator &rhs) {
+ return lhs.base() > rhs.base();
+ }
+
+ LIBC_INLINE friend constexpr bool operator>=(const iterator &lhs,
+ const iterator &rhs) {
+ return lhs.base() >= rhs.base();
+ }
+
+ LIBC_INLINE constexpr iterator_type base() const { return current; }
+
+ LIBC_INLINE constexpr reference operator*() const {
+ Iter tmp = current;
+ return *tmp;
+ }
+ LIBC_INLINE constexpr iterator operator--() {
+ --current;
+ return *this;
+ }
+ LIBC_INLINE constexpr iterator &operator++() {
+ ++current;
+ return *this;
+ }
+ LIBC_INLINE constexpr iterator operator++(int) {
+ iterator tmp(*this);
+ ++current;
+ return tmp;
+ }
+};
+
} // namespace cpp
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/__support/fixedvector.h b/libc/src/__support/fixedvector.h
index 403b1620d20df..ef00fede07366 100644
--- a/libc/src/__support/fixedvector.h
+++ b/libc/src/__support/fixedvector.h
@@ -82,6 +82,10 @@ template <typename T, size_t CAPACITY> class FixedVector {
// can easily swap one data structure for the other.
static void destroy(FixedVector<T, CAPACITY> *store) { store->reset(); }
+ using iterator = typename cpp::array<T, CAPACITY>::iterator;
+ LIBC_INLINE constexpr iterator begin() { return iterator{&store[0]}; }
+ LIBC_INLINE constexpr iterator end() { return iterator{&store[item_count]}; }
+
using reverse_iterator = typename cpp::array<T, CAPACITY>::reverse_iterator;
LIBC_INLINE constexpr reverse_iterator rbegin() {
return reverse_iterator{&store[item_count]};
>From 9f23d216e4b98aedf4f6bc2a56b70f05edb9b6a4 Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Wed, 29 May 2024 22:40:02 -0400
Subject: [PATCH 13/16] rename logger
---
libc/benchmarks/gpu/BenchmarkLogger.cpp | 2 +-
libc/benchmarks/gpu/BenchmarkLogger.h | 2 +-
libc/benchmarks/gpu/LibcGpuBenchmark.h | 12 ++++++------
3 files changed, 8 insertions(+), 8 deletions(-)
diff --git a/libc/benchmarks/gpu/BenchmarkLogger.cpp b/libc/benchmarks/gpu/BenchmarkLogger.cpp
index 9a36ee5b3046c..2e7e8e7600fdb 100644
--- a/libc/benchmarks/gpu/BenchmarkLogger.cpp
+++ b/libc/benchmarks/gpu/BenchmarkLogger.cpp
@@ -91,7 +91,7 @@ template BenchmarkLogger &BenchmarkLogger::operator<< <UInt<320>>(UInt<320>);
// TODO: Add floating point formatting once it's supported by StringStream.
-BenchmarkLogger blog;
+BenchmarkLogger log;
} // namespace benchmarks
} // namespace LIBC_NAMESPACE
diff --git a/libc/benchmarks/gpu/BenchmarkLogger.h b/libc/benchmarks/gpu/BenchmarkLogger.h
index 98813b28eaa91..332ff1439e6f5 100644
--- a/libc/benchmarks/gpu/BenchmarkLogger.h
+++ b/libc/benchmarks/gpu/BenchmarkLogger.h
@@ -19,7 +19,7 @@ struct BenchmarkLogger {
};
// A global TestLogger instance to be used in tests.
-extern BenchmarkLogger blog;
+extern BenchmarkLogger log;
} // namespace benchmarks
} // namespace LIBC_NAMESPACE
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index 08e99dadc8d07..2a6fcd5ea2556 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -98,12 +98,12 @@ class Benchmark {
auto result = benchmark(options, func);
constexpr auto GREEN = "\033[32m";
constexpr auto RESET = "\033[0m";
- blog << GREEN << "[ RUN ] " << RESET << name << '\n';
- blog << GREEN << "[ OK ] " << RESET << name << ": " << result.cycles
- << " cycles, " << result.min << " min, " << result.max << " max, "
- << result.total_iterations << " iterations, " << result.total_time
- << " ns, " << static_cast<long>(result.standard_deviation)
- << " stddev\n";
+ log << GREEN << "[ RUN ] " << RESET << name << '\n';
+ log << GREEN << "[ OK ] " << RESET << name << ": " << result.cycles
+ << " cycles, " << result.min << " min, " << result.max << " max, "
+ << result.total_iterations << " iterations, " << result.total_time
+ << " ns, " << static_cast<long>(result.standard_deviation)
+ << " stddev\n";
}
const cpp::string_view get_name() const { return name; }
};
>From 46b5e25304e48e896297790ce17c2ac93db5a4b2 Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Wed, 29 May 2024 23:15:48 -0400
Subject: [PATCH 14/16] Revert "add forward iterator"
This reverts commit a5ebf57f198cd79be132854b036f904c3983341d.
---
libc/benchmarks/gpu/LibcGpuBenchmark.cpp | 4 +-
libc/src/__support/CPP/array.h | 26 ++++-----
libc/src/__support/CPP/iterator.h | 68 ------------------------
libc/src/__support/fixedvector.h | 4 --
4 files changed, 12 insertions(+), 90 deletions(-)
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index a7a02cacc3305..e4f839e361dd0 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -13,8 +13,8 @@ void Benchmark::add_benchmark(Benchmark *benchmark) {
}
void Benchmark::run_benchmarks() {
- for (Benchmark *benchmark : benchmarks)
- benchmark->run();
+ for (auto it = benchmarks.rbegin(), e = benchmarks.rend(); it != e; ++it)
+ (*it)->run();
}
BenchmarkResult benchmark(const BenchmarkOptions &options,
diff --git a/libc/src/__support/CPP/array.h b/libc/src/__support/CPP/array.h
index 7e4cf29847daf..4e69ba003e800 100644
--- a/libc/src/__support/CPP/array.h
+++ b/libc/src/__support/CPP/array.h
@@ -22,12 +22,10 @@ template <class T, size_t N> struct array {
T Data[N];
using value_type = T;
- using pointer_type = T *;
- using iterator = cpp::iterator<pointer_type>;
- using const_pointer_type = const T *;
- using const_iterator = cpp::iterator<const_pointer_type>;
- using reverse_iterator = cpp::reverse_iterator<pointer_type>;
- using const_reverse_iterator = cpp::reverse_iterator<const_pointer_type>;
+ using iterator = T *;
+ using const_iterator = const T *;
+ using reverse_iterator = cpp::reverse_iterator<iterator>;
+ using const_reverse_iterator = cpp::reverse_iterator<const_iterator>;
LIBC_INLINE constexpr T *data() { return Data; }
LIBC_INLINE constexpr const T *data() const { return Data; }
@@ -48,16 +46,12 @@ template <class T, size_t N> struct array {
LIBC_INLINE constexpr bool empty() const { return N == 0; }
- LIBC_INLINE constexpr iterator begin() { return iterator{Data}; }
- LIBC_INLINE constexpr const_iterator begin() const {
- return const_iterator{Data};
- }
+ LIBC_INLINE constexpr iterator begin() { return Data; }
+ LIBC_INLINE constexpr const_iterator begin() const { return Data; }
LIBC_INLINE constexpr const_iterator cbegin() const { return begin(); }
- LIBC_INLINE constexpr iterator end() { return iterator{Data + N}; }
- LIBC_INLINE constexpr const_iterator end() const {
- return const_iterator{Data + N};
- }
+ LIBC_INLINE constexpr iterator end() { return Data + N; }
+ LIBC_INLINE constexpr const_iterator end() const { return Data + N; }
LIBC_INLINE constexpr const_iterator cend() const { return end(); }
LIBC_INLINE constexpr reverse_iterator rbegin() {
@@ -71,10 +65,10 @@ template <class T, size_t N> struct array {
}
LIBC_INLINE constexpr reverse_iterator rend() {
- return reverse_iterator{Data};
+ return reverse_iterator{begin()};
}
LIBC_INLINE constexpr const_reverse_iterator rend() const {
- return const_reverse_iterator{Data};
+ return const_reverse_iterator{begin()};
}
LIBC_INLINE constexpr const_reverse_iterator crend() const { return rend(); }
};
diff --git a/libc/src/__support/CPP/iterator.h b/libc/src/__support/CPP/iterator.h
index 37d631b01582e..b0fd5c9f22ae0 100644
--- a/libc/src/__support/CPP/iterator.h
+++ b/libc/src/__support/CPP/iterator.h
@@ -92,74 +92,6 @@ template <typename Iter> class reverse_iterator {
}
};
-template <typename Iter> class iterator {
- Iter current;
-
-public:
- using reference = typename iterator_traits<Iter>::reference;
- using value_type = typename iterator_traits<Iter>::value_type;
- using iterator_type = Iter;
-
- LIBC_INLINE iterator() : current() {}
- LIBC_INLINE constexpr explicit iterator(Iter it) : current(it) {}
-
- template <typename Other,
- cpp::enable_if_t<!cpp::is_same_v<Iter, Other> &&
- cpp::is_convertible_v<const Other &, Iter>,
- int> = 0>
- LIBC_INLINE constexpr explicit iterator(const Other &it) : current(it) {}
-
- LIBC_INLINE friend constexpr bool operator==(const iterator &lhs,
- const iterator &rhs) {
- return lhs.base() == rhs.base();
- }
-
- LIBC_INLINE friend constexpr bool operator!=(const iterator &lhs,
- const iterator &rhs) {
- return lhs.base() != rhs.base();
- }
-
- LIBC_INLINE friend constexpr bool operator<(const iterator &lhs,
- const iterator &rhs) {
- return lhs.base() < rhs.base();
- }
-
- LIBC_INLINE friend constexpr bool operator<=(const iterator &lhs,
- const iterator &rhs) {
- return lhs.base() <= rhs.base();
- }
-
- LIBC_INLINE friend constexpr bool operator>(const iterator &lhs,
- const iterator &rhs) {
- return lhs.base() > rhs.base();
- }
-
- LIBC_INLINE friend constexpr bool operator>=(const iterator &lhs,
- const iterator &rhs) {
- return lhs.base() >= rhs.base();
- }
-
- LIBC_INLINE constexpr iterator_type base() const { return current; }
-
- LIBC_INLINE constexpr reference operator*() const {
- Iter tmp = current;
- return *tmp;
- }
- LIBC_INLINE constexpr iterator operator--() {
- --current;
- return *this;
- }
- LIBC_INLINE constexpr iterator &operator++() {
- ++current;
- return *this;
- }
- LIBC_INLINE constexpr iterator operator++(int) {
- iterator tmp(*this);
- ++current;
- return tmp;
- }
-};
-
} // namespace cpp
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/__support/fixedvector.h b/libc/src/__support/fixedvector.h
index ef00fede07366..403b1620d20df 100644
--- a/libc/src/__support/fixedvector.h
+++ b/libc/src/__support/fixedvector.h
@@ -82,10 +82,6 @@ template <typename T, size_t CAPACITY> class FixedVector {
// can easily swap one data structure for the other.
static void destroy(FixedVector<T, CAPACITY> *store) { store->reset(); }
- using iterator = typename cpp::array<T, CAPACITY>::iterator;
- LIBC_INLINE constexpr iterator begin() { return iterator{&store[0]}; }
- LIBC_INLINE constexpr iterator end() { return iterator{&store[item_count]}; }
-
using reverse_iterator = typename cpp::array<T, CAPACITY>::reverse_iterator;
LIBC_INLINE constexpr reverse_iterator rbegin() {
return reverse_iterator{&store[item_count]};
>From 945090f8cc726be9560411d219421f2d7e5da775 Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Sat, 15 Jun 2024 12:15:31 -0400
Subject: [PATCH 15/16] support multithreaded benchmarks
---
libc/benchmarks/gpu/CMakeLists.txt | 1 +
libc/benchmarks/gpu/LibcGpuBenchmark.cpp | 60 +++++++++++++++++++++-
libc/benchmarks/gpu/LibcGpuBenchmark.h | 13 +----
libc/benchmarks/gpu/timing/nvptx/timing.h | 6 ---
libc/cmake/modules/LLVMLibCTestRules.cmake | 8 ++-
5 files changed, 67 insertions(+), 21 deletions(-)
diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
index 9ed45eedc402e..2814434ccd26c 100644
--- a/libc/benchmarks/gpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/CMakeLists.txt
@@ -15,6 +15,7 @@ function(add_benchmark benchmark_name)
endif()
add_libc_hermetic(
${benchmark_name}
+ IS_BENCHMARK
LINK_LIBRARIES
LibcGpuBenchmark.hermetic
${BENCHMARK_LINK_LIBRARIES}
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index e4f839e361dd0..0776ebf950ddf 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -1,20 +1,76 @@
#include "LibcGpuBenchmark.h"
#include "src/__support/CPP/algorithm.h"
+#include "src/__support/CPP/array.h"
+#include "src/__support/CPP/string.h"
#include "src/__support/FPUtil/sqrt.h"
+#include "src/__support/GPU/utils.h"
+#include "src/__support/fixedvector.h"
#include "src/time/gpu/time_utils.h"
namespace LIBC_NAMESPACE {
namespace benchmarks {
FixedVector<Benchmark *, 64> benchmarks;
+cpp::array<BenchmarkResult, 1024> results;
void Benchmark::add_benchmark(Benchmark *benchmark) {
benchmarks.push_back(benchmark);
}
+BenchmarkResult reduce_results(cpp::array<BenchmarkResult, 1024> &results) {
+ BenchmarkResult result;
+ uint64_t cycles_sum = 0;
+ double standard_deviation_sum = 0;
+ uint64_t min = UINT64_MAX;
+ uint64_t max = 0;
+ uint32_t samples_sum = 0;
+ uint32_t iterations_sum = 0;
+ clock_t time_sum = 0;
+ uint64_t num_threads = gpu::get_num_threads();
+ for (uint64_t i = 0; i < num_threads; i++) {
+ BenchmarkResult current_result = results[i];
+ cycles_sum += current_result.cycles;
+ standard_deviation_sum += current_result.standard_deviation;
+ min = cpp::min(min, current_result.min);
+ max = cpp::max(max, current_result.max);
+ samples_sum += current_result.samples;
+ iterations_sum += current_result.total_iterations;
+ time_sum += current_result.total_time;
+ }
+ result.cycles = cycles_sum / num_threads;
+ result.standard_deviation = standard_deviation_sum / num_threads;
+ result.min = min;
+ result.max = max;
+ result.samples = samples_sum / num_threads;
+ result.total_iterations = iterations_sum / num_threads;
+ result.total_time = time_sum / num_threads;
+ return result;
+}
+
void Benchmark::run_benchmarks() {
- for (auto it = benchmarks.rbegin(), e = benchmarks.rend(); it != e; ++it)
- (*it)->run();
+ uint64_t id = gpu::get_thread_id();
+ gpu::sync_threads();
+
+ for (auto it = benchmarks.rbegin(), e = benchmarks.rend(); it != e; ++it) {
+ Benchmark *benchmark = *it;
+ results[id] = benchmark->run();
+ }
+ gpu::sync_threads();
+ if (id == 0) {
+ for (auto it = benchmarks.rbegin(), e = benchmarks.rend(); it != e; ++it) {
+ Benchmark *benchmark = *it;
+ BenchmarkResult all_results = reduce_results(results);
+ constexpr auto GREEN = "\033[32m";
+ constexpr auto RESET = "\033[0m";
+ log << GREEN << "[ RUN ] " << RESET << benchmark->get_name() << '\n';
+ log << GREEN << "[ OK ] " << RESET << benchmark->get_name() << ": "
+ << all_results.cycles << " cycles, " << all_results.min << " min, "
+ << all_results.max << " max, " << all_results.total_iterations
+ << " iterations, " << all_results.total_time << " ns, "
+ << static_cast<long>(all_results.standard_deviation) << " stddev\n";
+ }
+ }
+ gpu::sync_threads();
}
BenchmarkResult benchmark(const BenchmarkOptions &options,
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index 2a6fcd5ea2556..59dd589462080 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -6,7 +6,6 @@
#include "src/__support/CPP/functional.h"
#include "src/__support/CPP/limits.h"
#include "src/__support/CPP/string_view.h"
-#include "src/__support/fixedvector.h"
#include "src/time/clock.h"
#include <stdint.h>
@@ -93,17 +92,9 @@ class Benchmark {
static void add_benchmark(Benchmark *benchmark);
private:
- void run() {
+ BenchmarkResult run() {
BenchmarkOptions options;
- auto result = benchmark(options, func);
- constexpr auto GREEN = "\033[32m";
- constexpr auto RESET = "\033[0m";
- log << GREEN << "[ RUN ] " << RESET << name << '\n';
- log << GREEN << "[ OK ] " << RESET << name << ": " << result.cycles
- << " cycles, " << result.min << " min, " << result.max << " max, "
- << result.total_iterations << " iterations, " << result.total_time
- << " ns, " << static_cast<long>(result.standard_deviation)
- << " stddev\n";
+ return benchmark(options, func);
}
const cpp::string_view get_name() const { return name; }
};
diff --git a/libc/benchmarks/gpu/timing/nvptx/timing.h b/libc/benchmarks/gpu/timing/nvptx/timing.h
index 001bdd3686062..5c45425706f11 100644
--- a/libc/benchmarks/gpu/timing/nvptx/timing.h
+++ b/libc/benchmarks/gpu/timing/nvptx/timing.h
@@ -24,13 +24,11 @@ namespace LIBC_NAMESPACE {
[[gnu::noinline]] static uint64_t overhead() {
volatile uint32_t x = 1;
uint32_t y = x;
- gpu::sync_threads();
uint64_t start = gpu::processor_clock();
asm volatile("" ::"r"(y), "llr"(start));
uint32_t result = y;
asm volatile("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :);
uint64_t stop = gpu::processor_clock();
- gpu::sync_threads();
volatile auto storage = result;
return stop - start;
}
@@ -47,7 +45,6 @@ template <typename F, typename T>
asm volatile("" ::"r"(arg));
// Get the current timestamp from the clock.
- gpu::sync_threads();
gpu::memory_fence();
uint64_t start = gpu::processor_clock();
@@ -66,7 +63,6 @@ template <typename F, typename T>
// ordering.
uint64_t stop = gpu::processor_clock();
gpu::memory_fence();
- gpu::sync_threads();
asm volatile("" ::"r"(stop));
volatile T output = result;
@@ -82,7 +78,6 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {
T2 arg2 = storage2;
asm volatile("" ::"r"(arg), "r"(arg2));
- gpu::sync_threads();
gpu::memory_fence();
uint64_t start = gpu::processor_clock();
@@ -94,7 +89,6 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {
uint64_t stop = gpu::processor_clock();
gpu::memory_fence();
- gpu::sync_threads();
asm volatile("" ::"r"(stop));
volatile auto output = result;
diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake
index 508694ae9fc01..fbeec32883b63 100644
--- a/libc/cmake/modules/LLVMLibCTestRules.cmake
+++ b/libc/cmake/modules/LLVMLibCTestRules.cmake
@@ -553,7 +553,7 @@ function(add_libc_hermetic test_name)
endif()
cmake_parse_arguments(
"HERMETIC_TEST"
- "" # No optional arguments
+ "IS_BENCHMARK" # Optional arguments
"SUITE" # Single value arguments
"SRCS;HDRS;DEPENDS;ARGS;ENV;COMPILE_OPTIONS;LINK_LIBRARIES;LOADER_ARGS" # Multi-value arguments
${ARGN}
@@ -716,7 +716,11 @@ function(add_libc_hermetic test_name)
)
add_dependencies(${HERMETIC_TEST_SUITE} ${fq_target_name})
- add_dependencies(libc-hermetic-tests ${fq_target_name})
+ if(NOT ${HERMETIC_TEST_IS_BENCHMARK})
+ # If it is a benchmark, it will already have been added to the
+ # gpu-benchmark target
+ add_dependencies(libc-hermetic-tests ${fq_target_name})
+ endif()
endfunction(add_libc_hermetic)
# A convenience function to add both a unit test as well as a hermetic test.
>From 4aa5e8bc05d814e67a332f578ed2893230e90dd7 Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Sat, 15 Jun 2024 14:26:17 -0400
Subject: [PATCH 16/16] use for each syntax
---
libc/benchmarks/gpu/CMakeLists.txt | 2 +-
libc/benchmarks/gpu/LibcGpuBenchmark.cpp | 13 +++++--------
2 files changed, 6 insertions(+), 9 deletions(-)
diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
index 2814434ccd26c..4d2a3a4ac66d3 100644
--- a/libc/benchmarks/gpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/CMakeLists.txt
@@ -47,7 +47,7 @@ add_unittest_framework_library(
libc.src.__support.macros.properties.types
libc.src.__support.OSUtil.osutil
libc.src.__support.uint128
- libc.src.___support.FPUtil.sqrt
+ libc.src.__support.FPUtil.sqrt
libc.src.__support.fixedvector
libc.src.time.clock
libc.benchmarks.gpu.timing.timing
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index 0776ebf950ddf..69adb0c95ba76 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -51,14 +51,11 @@ void Benchmark::run_benchmarks() {
uint64_t id = gpu::get_thread_id();
gpu::sync_threads();
- for (auto it = benchmarks.rbegin(), e = benchmarks.rend(); it != e; ++it) {
- Benchmark *benchmark = *it;
+ for (Benchmark *benchmark : benchmarks)
results[id] = benchmark->run();
- }
gpu::sync_threads();
if (id == 0) {
- for (auto it = benchmarks.rbegin(), e = benchmarks.rend(); it != e; ++it) {
- Benchmark *benchmark = *it;
+ for (Benchmark *benchmark : benchmarks) {
BenchmarkResult all_results = reduce_results(results);
constexpr auto GREEN = "\033[32m";
constexpr auto RESET = "\033[0m";
@@ -128,9 +125,9 @@ BenchmarkResult benchmark(const BenchmarkOptions &options,
iterations *= options.scaling_factor;
}
result.cycles = best_guess;
- result.standard_deviation =
- fputil::sqrt(static_cast<double>(cycles_squared) / total_iterations -
- (best_guess * best_guess));
+ result.standard_deviation = fputil::sqrt<double>(
+ static_cast<double>(cycles_squared) / total_iterations -
+ static_cast<double>(best_guess * best_guess));
result.min = min;
result.max = max;
result.samples = samples;
More information about the libc-commits
mailing list