[libc-commits] [libc] [libc] Add Multithreaded GPU Benchmarks (PR #98964)
via libc-commits
libc-commits at lists.llvm.org
Wed Jul 17 15:29:32 PDT 2024
https://github.com/jameshu15869 updated https://github.com/llvm/llvm-project/pull/98964
>From 25bcbd1765f6c22a8a395a47e42b461fa67eb49e Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Sun, 14 Jul 2024 18:35:54 -0400
Subject: [PATCH 1/4] run benchmarks on warps by default, adding the option for
single threaded benchmarks
---
libc/benchmarks/gpu/CMakeLists.txt | 6 ++++++
libc/benchmarks/gpu/LibcGpuBenchmark.cpp | 6 ++++--
libc/benchmarks/gpu/LibcGpuBenchmark.h | 14 +++++++++++---
.../benchmarks/gpu/src/ctype/isalnum_benchmark.cpp | 2 ++
4 files changed, 23 insertions(+), 5 deletions(-)
diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
index eaeecbdacd23e..8c409bc6ef3ea 100644
--- a/libc/benchmarks/gpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/CMakeLists.txt
@@ -10,6 +10,10 @@ function(add_benchmark benchmark_name)
"LINK_LIBRARIES" # Multi-value arguments
${ARGN}
)
+ # We run benchmarks for a single warp with and give the
+ # option to run only a single thread
+ set(BENCHMARK_NUM_THREADS 32)
+
if(NOT libc.src.time.clock IN_LIST TARGET_LLVMLIBC_ENTRYPOINTS)
message(FATAL_ERROR "target does not support clock")
endif()
@@ -19,6 +23,8 @@ function(add_benchmark benchmark_name)
LINK_LIBRARIES
LibcGpuBenchmark.hermetic
${BENCHMARK_LINK_LIBRARIES}
+ LOADER_ARGS
+ --threads ${BENCHMARK_NUM_THREADS}
${BENCHMARK_UNPARSED_ARGUMENTS}
)
get_fq_target_name(${benchmark_name} fq_target_name)
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index 23fff3e8180f7..2094d33e1e9e7 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -114,8 +114,10 @@ void Benchmark::run_benchmarks() {
all_results.reset();
gpu::sync_threads();
- auto current_result = b->run();
- all_results.update(current_result);
+ if (!(b->flags & BenchmarkFlags::SINGLE_THREADED) || id == 0) {
+ auto current_result = b->run();
+ all_results.update(current_result);
+ }
gpu::sync_threads();
if (id == 0)
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index 1f813f8655de6..53f35768e1bf1 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -74,16 +74,19 @@ struct BenchmarkResult {
clock_t total_time = 0;
};
+enum BenchmarkFlags { SINGLE_THREADED = 0x1 };
+
BenchmarkResult benchmark(const BenchmarkOptions &options,
cpp::function<uint64_t(void)> wrapper_func);
class Benchmark {
const cpp::function<uint64_t(void)> func;
const cpp::string_view name;
+ const uint8_t flags;
public:
- Benchmark(cpp::function<uint64_t(void)> func, char const *name)
- : func(func), name(name) {
+ Benchmark(cpp::function<uint64_t(void)> func, char const *name, uint8_t flags)
+ : func(func), name(name), flags(flags) {
add_benchmark(this);
}
@@ -104,6 +107,11 @@ class Benchmark {
#define BENCHMARK(SuiteName, TestName, Func) \
LIBC_NAMESPACE::benchmarks::Benchmark SuiteName##_##TestName##_Instance( \
- Func, #SuiteName "." #TestName)
+ Func, #SuiteName "." #TestName, 0)
+
+#define SINGLE_THREADED_BENCHMARK(SuiteName, TestName, Func) \
+ LIBC_NAMESPACE::benchmarks::Benchmark SuiteName##_##TestName##_Instance( \
+ Func, #SuiteName "." #TestName, \
+ LIBC_NAMESPACE::benchmarks::BenchmarkFlags::SINGLE_THREADED)
#endif
diff --git a/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp b/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp
index 6f8d247902f76..d9c1a804ec506 100644
--- a/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp
+++ b/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp
@@ -7,6 +7,8 @@ uint64_t BM_IsAlnum() {
return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalnum, x);
}
BENCHMARK(LlvmLibcIsAlNumGpuBenchmark, IsAlnum, BM_IsAlnum);
+SINGLE_THREADED_BENCHMARK(LlvmLibcIsAlNumGpuBenchmark, IsAlnumSingleThread,
+ BM_IsAlnum);
uint64_t BM_IsAlnumCapital() {
char x = 'A';
>From b96f564eed7964005125af7e54f51f34445cbe2f Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Tue, 16 Jul 2024 00:56:10 -0400
Subject: [PATCH 2/4] specify threads when registering benchmarks in cmake
---
libc/benchmarks/gpu/CMakeLists.txt | 9 +++++----
libc/benchmarks/gpu/src/ctype/CMakeLists.txt | 4 ++++
2 files changed, 9 insertions(+), 4 deletions(-)
diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
index 8c409bc6ef3ea..8458842b77fc1 100644
--- a/libc/benchmarks/gpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/CMakeLists.txt
@@ -6,13 +6,14 @@ function(add_benchmark benchmark_name)
cmake_parse_arguments(
"BENCHMARK"
"" # Optional arguments
- "" # Single value arguments
+ "NUM_THREADS" # Single value arguments
"LINK_LIBRARIES" # Multi-value arguments
${ARGN}
)
- # We run benchmarks for a single warp with and give the
- # option to run only a single thread
- set(BENCHMARK_NUM_THREADS 32)
+
+ if(NOT ${BENCHMARK_NUM_THREADS})
+ set(BENCHMARK_NUM_THREADS 1)
+ endif()
if(NOT libc.src.time.clock IN_LIST TARGET_LLVMLIBC_ENTRYPOINTS)
message(FATAL_ERROR "target does not support clock")
diff --git a/libc/benchmarks/gpu/src/ctype/CMakeLists.txt b/libc/benchmarks/gpu/src/ctype/CMakeLists.txt
index 79f01425770da..f41e5c94c5060 100644
--- a/libc/benchmarks/gpu/src/ctype/CMakeLists.txt
+++ b/libc/benchmarks/gpu/src/ctype/CMakeLists.txt
@@ -8,6 +8,8 @@ add_benchmark(
isalnum_benchmark.cpp
DEPENDS
libc.src.ctype.isalnum
+ NUM_THREADS
+ 32
)
add_benchmark(
@@ -18,4 +20,6 @@ add_benchmark(
isalpha_benchmark.cpp
DEPENDS
libc.src.ctype.isalpha
+ NUM_THREADS
+ 32
)
>From c49436c98e8599b0234a495a9e662c24c8a91c28 Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Wed, 17 Jul 2024 00:17:47 -0400
Subject: [PATCH 3/4] correctly handle default arg for num threads
---
libc/benchmarks/gpu/CMakeLists.txt | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
index 8458842b77fc1..47b541fb2471e 100644
--- a/libc/benchmarks/gpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/CMakeLists.txt
@@ -11,10 +11,10 @@ function(add_benchmark benchmark_name)
${ARGN}
)
- if(NOT ${BENCHMARK_NUM_THREADS})
+ if(NOT BENCHMARK_NUM_THREADS)
set(BENCHMARK_NUM_THREADS 1)
endif()
-
+
if(NOT libc.src.time.clock IN_LIST TARGET_LLVMLIBC_ENTRYPOINTS)
message(FATAL_ERROR "target does not support clock")
endif()
>From 6a2bc11f97b02d90ba32ff045748687666483764 Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Wed, 17 Jul 2024 18:28:58 -0400
Subject: [PATCH 4/4] make threads a loader arg and add single wave helper
---
libc/benchmarks/gpu/CMakeLists.txt | 8 +-------
libc/benchmarks/gpu/LibcGpuBenchmark.cpp | 5 ++++-
libc/benchmarks/gpu/LibcGpuBenchmark.h | 7 ++++++-
libc/benchmarks/gpu/src/ctype/CMakeLists.txt | 6 ++----
libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp | 2 ++
5 files changed, 15 insertions(+), 13 deletions(-)
diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
index 47b541fb2471e..dc1c429e604e3 100644
--- a/libc/benchmarks/gpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/CMakeLists.txt
@@ -6,14 +6,10 @@ function(add_benchmark benchmark_name)
cmake_parse_arguments(
"BENCHMARK"
"" # Optional arguments
- "NUM_THREADS" # Single value arguments
+ "" # Single value arguments
"LINK_LIBRARIES" # Multi-value arguments
${ARGN}
)
-
- if(NOT BENCHMARK_NUM_THREADS)
- set(BENCHMARK_NUM_THREADS 1)
- endif()
if(NOT libc.src.time.clock IN_LIST TARGET_LLVMLIBC_ENTRYPOINTS)
message(FATAL_ERROR "target does not support clock")
@@ -24,8 +20,6 @@ function(add_benchmark benchmark_name)
LINK_LIBRARIES
LibcGpuBenchmark.hermetic
${BENCHMARK_LINK_LIBRARIES}
- LOADER_ARGS
- --threads ${BENCHMARK_NUM_THREADS}
${BENCHMARK_UNPARSED_ARGUMENTS}
)
get_fq_target_name(${benchmark_name} fq_target_name)
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index 2094d33e1e9e7..c926d8efd7db2 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -114,7 +114,10 @@ void Benchmark::run_benchmarks() {
all_results.reset();
gpu::sync_threads();
- if (!(b->flags & BenchmarkFlags::SINGLE_THREADED) || id == 0) {
+ if (!b->flags ||
+ ((b->flags & BenchmarkFlags::SINGLE_THREADED) && id == 0) ||
+ ((b->flags & BenchmarkFlags::SINGLE_WAVE) &&
+ id < gpu::get_lane_size())) {
auto current_result = b->run();
all_results.update(current_result);
}
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index 53f35768e1bf1..29d7ba8b0a132 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -74,7 +74,7 @@ struct BenchmarkResult {
clock_t total_time = 0;
};
-enum BenchmarkFlags { SINGLE_THREADED = 0x1 };
+enum BenchmarkFlags { SINGLE_THREADED = 0x1, SINGLE_WAVE = 0x2 };
BenchmarkResult benchmark(const BenchmarkOptions &options,
cpp::function<uint64_t(void)> wrapper_func);
@@ -114,4 +114,9 @@ class Benchmark {
Func, #SuiteName "." #TestName, \
LIBC_NAMESPACE::benchmarks::BenchmarkFlags::SINGLE_THREADED)
+#define SINGLE_WAVE_BENCHMARK(SuiteName, TestName, Func) \
+ LIBC_NAMESPACE::benchmarks::Benchmark SuiteName##_##TestName##_Instance( \
+ Func, #SuiteName "." #TestName, \
+ LIBC_NAMESPACE::benchmarks::BenchmarkFlags::SINGLE_WAVE)
+
#endif
diff --git a/libc/benchmarks/gpu/src/ctype/CMakeLists.txt b/libc/benchmarks/gpu/src/ctype/CMakeLists.txt
index f41e5c94c5060..f277624dbb901 100644
--- a/libc/benchmarks/gpu/src/ctype/CMakeLists.txt
+++ b/libc/benchmarks/gpu/src/ctype/CMakeLists.txt
@@ -8,8 +8,8 @@ add_benchmark(
isalnum_benchmark.cpp
DEPENDS
libc.src.ctype.isalnum
- NUM_THREADS
- 32
+ LOADER_ARGS
+ --threads 64
)
add_benchmark(
@@ -20,6 +20,4 @@ add_benchmark(
isalpha_benchmark.cpp
DEPENDS
libc.src.ctype.isalpha
- NUM_THREADS
- 32
)
diff --git a/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp b/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp
index d9c1a804ec506..ffa5a99860bfc 100644
--- a/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp
+++ b/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp
@@ -9,6 +9,8 @@ uint64_t BM_IsAlnum() {
BENCHMARK(LlvmLibcIsAlNumGpuBenchmark, IsAlnum, BM_IsAlnum);
SINGLE_THREADED_BENCHMARK(LlvmLibcIsAlNumGpuBenchmark, IsAlnumSingleThread,
BM_IsAlnum);
+SINGLE_WAVE_BENCHMARK(LlvmLibcIsAlNumGpuBenchmark, IsAlnumSingleWave,
+ BM_IsAlnum);
uint64_t BM_IsAlnumCapital() {
char x = 'A';
More information about the libc-commits
mailing list