[libc-commits] [libc] [libc] Use Atomics in GPU Benchmarks (PR #98842)
via libc-commits
libc-commits at lists.llvm.org
Sun Jul 14 16:23:13 PDT 2024
https://github.com/jameshu15869 updated https://github.com/llvm/llvm-project/pull/98842
>From 568b01acf1d696dc4fd36a5e03ddea9067f95874 Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Sun, 14 Jul 2024 18:42:37 -0400
Subject: [PATCH 1/3] use atomics instead of reducing
---
libc/benchmarks/gpu/CMakeLists.txt | 1 +
libc/benchmarks/gpu/LibcGpuBenchmark.cpp | 129 ++++++++++++++++-------
libc/benchmarks/gpu/LibcGpuBenchmark.h | 4 +-
3 files changed, 93 insertions(+), 41 deletions(-)
diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
index d167abcaf2db1..eaeecbdacd23e 100644
--- a/libc/benchmarks/gpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/CMakeLists.txt
@@ -43,6 +43,7 @@ add_unittest_framework_library(
libc.src.__support.CPP.functional
libc.src.__support.CPP.limits
libc.src.__support.CPP.algorithm
+ libc.src.__support.CPP.atomic
libc.src.__support.fixed_point.fx_rep
libc.src.__support.macros.properties.types
libc.src.__support.OSUtil.osutil
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index 1c1ba7639d0b1..4a6f6267dd170 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -1,6 +1,7 @@
#include "LibcGpuBenchmark.h"
#include "src/__support/CPP/algorithm.h"
#include "src/__support/CPP/array.h"
+#include "src/__support/CPP/atomic.h"
#include "src/__support/CPP/string.h"
#include "src/__support/FPUtil/sqrt.h"
#include "src/__support/GPU/utils.h"
@@ -12,41 +13,81 @@ namespace LIBC_NAMESPACE_DECL {
namespace benchmarks {
FixedVector<Benchmark *, 64> benchmarks;
-cpp::array<BenchmarkResult, 1024> results;
void Benchmark::add_benchmark(Benchmark *benchmark) {
benchmarks.push_back(benchmark);
}
-BenchmarkResult
-reduce_results(const cpp::array<BenchmarkResult, 1024> &results) {
- BenchmarkResult result;
- uint64_t cycles_sum = 0;
- double standard_deviation_sum = 0;
- uint64_t min = UINT64_MAX;
- uint64_t max = 0;
- uint32_t samples_sum = 0;
- uint32_t iterations_sum = 0;
- clock_t time_sum = 0;
- uint64_t num_threads = gpu::get_num_threads();
- for (uint64_t i = 0; i < num_threads; i++) {
- BenchmarkResult current_result = results[i];
- cycles_sum += current_result.cycles;
- standard_deviation_sum += current_result.standard_deviation;
- min = cpp::min(min, current_result.min);
- max = cpp::max(max, current_result.max);
- samples_sum += current_result.samples;
- iterations_sum += current_result.total_iterations;
- time_sum += current_result.total_time;
+void update_sums(const BenchmarkResult &current_result,
+ cpp::Atomic<uint64_t> &active_threads,
+ cpp::Atomic<uint64_t> &cycles_sum,
+ cpp::Atomic<uint64_t> &standard_deviation_sum,
+ cpp::Atomic<uint64_t> &min, cpp::Atomic<uint64_t> &max,
+ cpp::Atomic<uint32_t> &samples_sum,
+ cpp::Atomic<uint32_t> &iterations_sum,
+ cpp::Atomic<clock_t> &time_sum) {
+ gpu::memory_fence();
+ active_threads.fetch_add(1, cpp::MemoryOrder::RELAXED);
+
+ cycles_sum.fetch_add(current_result.cycles, cpp::MemoryOrder::RELAXED);
+ standard_deviation_sum.fetch_add(
+ static_cast<uint64_t>(current_result.standard_deviation),
+ cpp::MemoryOrder::RELAXED);
+
+ // Perform a CAS loop to atomically update the min
+ uint64_t orig_min = min.load(cpp::MemoryOrder::RELAXED);
+ while (!min.compare_exchange_strong(
+ orig_min, cpp::min(orig_min, current_result.min),
+ cpp::MemoryOrder::ACQUIRE, cpp::MemoryOrder::RELAXED)) {
}
- result.cycles = cycles_sum / num_threads;
- result.standard_deviation = standard_deviation_sum / num_threads;
- result.min = min;
- result.max = max;
- result.samples = samples_sum / num_threads;
- result.total_iterations = iterations_sum / num_threads;
- result.total_time = time_sum / num_threads;
- return result;
+
+ // Perform a CAS loop to atomically update the max
+ uint64_t orig_max = max.load(cpp::MemoryOrder::RELAXED);
+ while (!max.compare_exchange_strong(
+ orig_max, cpp::max(orig_max, current_result.max),
+ cpp::MemoryOrder::ACQUIRE, cpp::MemoryOrder::RELAXED)) {
+ }
+
+ samples_sum.fetch_add(current_result.samples, cpp::MemoryOrder::RELAXED);
+ iterations_sum.fetch_add(current_result.total_iterations,
+ cpp::MemoryOrder::RELAXED);
+ time_sum.fetch_add(current_result.total_time, cpp::MemoryOrder::RELAXED);
+ gpu::memory_fence();
+}
+
+cpp::Atomic<uint64_t> cycles_sum = 0;
+cpp::Atomic<uint64_t> standard_deviation_sum = 0;
+cpp::Atomic<uint64_t> min = UINT64_MAX;
+cpp::Atomic<uint64_t> max = 0;
+cpp::Atomic<uint32_t> samples_sum = 0;
+cpp::Atomic<uint32_t> iterations_sum = 0;
+cpp::Atomic<clock_t> time_sum = 0;
+cpp::Atomic<uint64_t> active_threads = 0;
+
+void print_results(Benchmark *b) {
+ constexpr auto GREEN = "\033[32m";
+ constexpr auto RESET = "\033[0m";
+
+ BenchmarkResult result;
+ gpu::memory_fence();
+ int num_threads = active_threads.load(cpp::MemoryOrder::RELAXED);
+ result.cycles = cycles_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
+ result.standard_deviation =
+ standard_deviation_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
+ result.min = min.load(cpp::MemoryOrder::RELAXED);
+ result.max = max.load(cpp::MemoryOrder::RELAXED);
+ result.samples = samples_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
+ result.total_iterations =
+ iterations_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
+ result.total_time = time_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
+ gpu::memory_fence();
+ log << GREEN << "[ RUN ] " << RESET << b->get_name() << '\n';
+ log << GREEN << "[ OK ] " << RESET << b->get_name() << ": "
+ << result.cycles << " cycles, " << result.min << " min, " << result.max
+ << " max, " << result.total_iterations << " iterations, "
+ << result.total_time << " ns, "
+ << static_cast<long>(result.standard_deviation)
+ << " stddev (num threads: " << num_threads << ")\n";
}
void Benchmark::run_benchmarks() {
@@ -54,18 +95,28 @@ void Benchmark::run_benchmarks() {
gpu::sync_threads();
for (Benchmark *b : benchmarks) {
- results[id] = b->run();
+ gpu::memory_fence();
+ if (id == 0) {
+ active_threads.store(0, cpp::MemoryOrder::RELAXED);
+ cycles_sum.store(0, cpp::MemoryOrder::RELAXED);
+ standard_deviation_sum.store(0, cpp::MemoryOrder::RELAXED);
+ min.store(UINT64_MAX, cpp::MemoryOrder::RELAXED);
+ max.store(0, cpp::MemoryOrder::RELAXED);
+ samples_sum.store(0, cpp::MemoryOrder::RELAXED);
+ iterations_sum.store(0, cpp::MemoryOrder::RELAXED);
+ time_sum.store(0, cpp::MemoryOrder::RELAXED);
+ }
+ gpu::memory_fence();
gpu::sync_threads();
+
+ auto current_result = b->run();
+ update_sums(current_result, active_threads, cycles_sum,
+ standard_deviation_sum, min, max, samples_sum, iterations_sum,
+ time_sum);
+ gpu::sync_threads();
+
if (id == 0) {
- BenchmarkResult all_results = reduce_results(results);
- constexpr auto GREEN = "\033[32m";
- constexpr auto RESET = "\033[0m";
- log << GREEN << "[ RUN ] " << RESET << b->get_name() << '\n';
- log << GREEN << "[ OK ] " << RESET << b->get_name() << ": "
- << all_results.cycles << " cycles, " << all_results.min << " min, "
- << all_results.max << " max, " << all_results.total_iterations
- << " iterations, " << all_results.total_time << " ns, "
- << static_cast<long>(all_results.standard_deviation) << " stddev\n";
+ print_results(b);
}
}
gpu::sync_threads();
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index 26cb0fd30bc1c..1f813f8655de6 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -88,6 +88,7 @@ class Benchmark {
}
static void run_benchmarks();
+ const cpp::string_view get_name() const { return name; }
protected:
static void add_benchmark(Benchmark *benchmark);
@@ -97,13 +98,12 @@ class Benchmark {
BenchmarkOptions options;
return benchmark(options, func);
}
- const cpp::string_view get_name() const { return name; }
};
} // namespace benchmarks
} // namespace LIBC_NAMESPACE_DECL
#define BENCHMARK(SuiteName, TestName, Func) \
LIBC_NAMESPACE::benchmarks::Benchmark SuiteName##_##TestName##_Instance( \
- Func, #SuiteName "." #TestName);
+ Func, #SuiteName "." #TestName)
#endif
>From 42f99f6eda758638248255549555dd2c41e564a9 Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Sun, 14 Jul 2024 19:19:25 -0400
Subject: [PATCH 2/3] create global struct for atomic sums
---
libc/benchmarks/gpu/LibcGpuBenchmark.cpp | 123 ++++++++++++-----------
1 file changed, 63 insertions(+), 60 deletions(-)
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index 4a6f6267dd170..b43266d88c70b 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -18,51 +18,59 @@ void Benchmark::add_benchmark(Benchmark *benchmark) {
benchmarks.push_back(benchmark);
}
-void update_sums(const BenchmarkResult &current_result,
- cpp::Atomic<uint64_t> &active_threads,
- cpp::Atomic<uint64_t> &cycles_sum,
- cpp::Atomic<uint64_t> &standard_deviation_sum,
- cpp::Atomic<uint64_t> &min, cpp::Atomic<uint64_t> &max,
- cpp::Atomic<uint32_t> &samples_sum,
- cpp::Atomic<uint32_t> &iterations_sum,
- cpp::Atomic<clock_t> &time_sum) {
- gpu::memory_fence();
- active_threads.fetch_add(1, cpp::MemoryOrder::RELAXED);
-
- cycles_sum.fetch_add(current_result.cycles, cpp::MemoryOrder::RELAXED);
- standard_deviation_sum.fetch_add(
- static_cast<uint64_t>(current_result.standard_deviation),
- cpp::MemoryOrder::RELAXED);
-
- // Perform a CAS loop to atomically update the min
- uint64_t orig_min = min.load(cpp::MemoryOrder::RELAXED);
- while (!min.compare_exchange_strong(
- orig_min, cpp::min(orig_min, current_result.min),
- cpp::MemoryOrder::ACQUIRE, cpp::MemoryOrder::RELAXED)) {
+struct AtomicBenchmarkSums {
+ cpp::Atomic<uint64_t> cycles_sum = 0;
+ cpp::Atomic<uint64_t> standard_deviation_sum = 0;
+ cpp::Atomic<uint64_t> min = UINT64_MAX;
+ cpp::Atomic<uint64_t> max = 0;
+ cpp::Atomic<uint32_t> samples_sum = 0;
+ cpp::Atomic<uint32_t> iterations_sum = 0;
+ cpp::Atomic<clock_t> time_sum = 0;
+ cpp::Atomic<uint64_t> active_threads = 0;
+
+ void reset() {
+ active_threads.store(0, cpp::MemoryOrder::RELAXED);
+ cycles_sum.store(0, cpp::MemoryOrder::RELAXED);
+ standard_deviation_sum.store(0, cpp::MemoryOrder::RELAXED);
+ min.store(UINT64_MAX, cpp::MemoryOrder::RELAXED);
+ max.store(0, cpp::MemoryOrder::RELAXED);
+ samples_sum.store(0, cpp::MemoryOrder::RELAXED);
+ iterations_sum.store(0, cpp::MemoryOrder::RELAXED);
+ time_sum.store(0, cpp::MemoryOrder::RELAXED);
}
- // Perform a CAS loop to atomically update the max
- uint64_t orig_max = max.load(cpp::MemoryOrder::RELAXED);
- while (!max.compare_exchange_strong(
- orig_max, cpp::max(orig_max, current_result.max),
- cpp::MemoryOrder::ACQUIRE, cpp::MemoryOrder::RELAXED)) {
- }
+ void update(const BenchmarkResult &result) {
+ gpu::memory_fence();
+ active_threads.fetch_add(1, cpp::MemoryOrder::RELAXED);
+
+ cycles_sum.fetch_add(result.cycles, cpp::MemoryOrder::RELAXED);
+ standard_deviation_sum.fetch_add(
+ static_cast<uint64_t>(result.standard_deviation),
+ cpp::MemoryOrder::RELAXED);
+
+ // Perform a CAS loop to atomically update the min
+ uint64_t orig_min = min.load(cpp::MemoryOrder::RELAXED);
+ while (!min.compare_exchange_strong(
+ orig_min, cpp::min(orig_min, result.min), cpp::MemoryOrder::ACQUIRE,
+ cpp::MemoryOrder::RELAXED)) {
+ }
- samples_sum.fetch_add(current_result.samples, cpp::MemoryOrder::RELAXED);
- iterations_sum.fetch_add(current_result.total_iterations,
- cpp::MemoryOrder::RELAXED);
- time_sum.fetch_add(current_result.total_time, cpp::MemoryOrder::RELAXED);
- gpu::memory_fence();
-}
+ // Perform a CAS loop to atomically update the max
+ uint64_t orig_max = max.load(cpp::MemoryOrder::RELAXED);
+ while (!max.compare_exchange_strong(
+ orig_max, cpp::max(orig_max, result.max), cpp::MemoryOrder::ACQUIRE,
+ cpp::MemoryOrder::RELAXED)) {
+ }
+
+ samples_sum.fetch_add(result.samples, cpp::MemoryOrder::RELAXED);
+ iterations_sum.fetch_add(result.total_iterations,
+ cpp::MemoryOrder::RELAXED);
+ time_sum.fetch_add(result.total_time, cpp::MemoryOrder::RELAXED);
+ gpu::memory_fence();
+ }
+};
-cpp::Atomic<uint64_t> cycles_sum = 0;
-cpp::Atomic<uint64_t> standard_deviation_sum = 0;
-cpp::Atomic<uint64_t> min = UINT64_MAX;
-cpp::Atomic<uint64_t> max = 0;
-cpp::Atomic<uint32_t> samples_sum = 0;
-cpp::Atomic<uint32_t> iterations_sum = 0;
-cpp::Atomic<clock_t> time_sum = 0;
-cpp::Atomic<uint64_t> active_threads = 0;
+AtomicBenchmarkSums all_results;
void print_results(Benchmark *b) {
constexpr auto GREEN = "\033[32m";
@@ -70,16 +78,20 @@ void print_results(Benchmark *b) {
BenchmarkResult result;
gpu::memory_fence();
- int num_threads = active_threads.load(cpp::MemoryOrder::RELAXED);
- result.cycles = cycles_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
+ int num_threads = all_results.active_threads.load(cpp::MemoryOrder::RELAXED);
+ result.cycles =
+ all_results.cycles_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
result.standard_deviation =
- standard_deviation_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
- result.min = min.load(cpp::MemoryOrder::RELAXED);
- result.max = max.load(cpp::MemoryOrder::RELAXED);
- result.samples = samples_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
+ all_results.standard_deviation_sum.load(cpp::MemoryOrder::RELAXED) /
+ num_threads;
+ result.min = all_results.min.load(cpp::MemoryOrder::RELAXED);
+ result.max = all_results.max.load(cpp::MemoryOrder::RELAXED);
+ result.samples =
+ all_results.samples_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
result.total_iterations =
- iterations_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
- result.total_time = time_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
+ all_results.iterations_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
+ result.total_time =
+ all_results.time_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
gpu::memory_fence();
log << GREEN << "[ RUN ] " << RESET << b->get_name() << '\n';
log << GREEN << "[ OK ] " << RESET << b->get_name() << ": "
@@ -97,22 +109,13 @@ void Benchmark::run_benchmarks() {
for (Benchmark *b : benchmarks) {
gpu::memory_fence();
if (id == 0) {
- active_threads.store(0, cpp::MemoryOrder::RELAXED);
- cycles_sum.store(0, cpp::MemoryOrder::RELAXED);
- standard_deviation_sum.store(0, cpp::MemoryOrder::RELAXED);
- min.store(UINT64_MAX, cpp::MemoryOrder::RELAXED);
- max.store(0, cpp::MemoryOrder::RELAXED);
- samples_sum.store(0, cpp::MemoryOrder::RELAXED);
- iterations_sum.store(0, cpp::MemoryOrder::RELAXED);
- time_sum.store(0, cpp::MemoryOrder::RELAXED);
+ all_results.reset();
}
gpu::memory_fence();
gpu::sync_threads();
auto current_result = b->run();
- update_sums(current_result, active_threads, cycles_sum,
- standard_deviation_sum, min, max, samples_sum, iterations_sum,
- time_sum);
+ all_results.update(current_result);
gpu::sync_threads();
if (id == 0) {
>From 00c8be0b1ddb8ff9f930a0d53ed5775b5f75a81b Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Sun, 14 Jul 2024 19:22:44 -0400
Subject: [PATCH 3/3] use release instead of relaxed for reading stored values
before printing
---
libc/benchmarks/gpu/LibcGpuBenchmark.cpp | 18 +++++++++---------
1 file changed, 9 insertions(+), 9 deletions(-)
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index b43266d88c70b..5326e82ffa715 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -78,27 +78,27 @@ void print_results(Benchmark *b) {
BenchmarkResult result;
gpu::memory_fence();
- int num_threads = all_results.active_threads.load(cpp::MemoryOrder::RELAXED);
+ int num_threads = all_results.active_threads.load(cpp::MemoryOrder::RELEASE);
result.cycles =
- all_results.cycles_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
+ all_results.cycles_sum.load(cpp::MemoryOrder::RELEASE) / num_threads;
result.standard_deviation =
- all_results.standard_deviation_sum.load(cpp::MemoryOrder::RELAXED) /
+ all_results.standard_deviation_sum.load(cpp::MemoryOrder::RELEASE) /
num_threads;
- result.min = all_results.min.load(cpp::MemoryOrder::RELAXED);
- result.max = all_results.max.load(cpp::MemoryOrder::RELAXED);
+ result.min = all_results.min.load(cpp::MemoryOrder::RELEASE);
+ result.max = all_results.max.load(cpp::MemoryOrder::RELEASE);
result.samples =
- all_results.samples_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
+ all_results.samples_sum.load(cpp::MemoryOrder::RELEASE) / num_threads;
result.total_iterations =
- all_results.iterations_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
+ all_results.iterations_sum.load(cpp::MemoryOrder::RELEASE) / num_threads;
result.total_time =
- all_results.time_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
+ all_results.time_sum.load(cpp::MemoryOrder::RELEASE) / num_threads;
gpu::memory_fence();
log << GREEN << "[ RUN ] " << RESET << b->get_name() << '\n';
log << GREEN << "[ OK ] " << RESET << b->get_name() << ": "
<< result.cycles << " cycles, " << result.min << " min, " << result.max
<< " max, " << result.total_iterations << " iterations, "
<< result.total_time << " ns, "
- << static_cast<long>(result.standard_deviation)
+ << static_cast<uint64_t>(result.standard_deviation)
<< " stddev (num threads: " << num_threads << ")\n";
}
More information about the libc-commits
mailing list