[libc-commits] [libc] [libc] Use Atomics in GPU Benchmarks (PR #98842)
Joseph Huber via libc-commits
libc-commits at lists.llvm.org
Sun Jul 14 16:32:18 PDT 2024
================
@@ -12,60 +13,113 @@ namespace LIBC_NAMESPACE_DECL {
namespace benchmarks {
FixedVector<Benchmark *, 64> benchmarks;
-cpp::array<BenchmarkResult, 1024> results;
void Benchmark::add_benchmark(Benchmark *benchmark) {
benchmarks.push_back(benchmark);
}
-BenchmarkResult
-reduce_results(const cpp::array<BenchmarkResult, 1024> &results) {
- BenchmarkResult result;
- uint64_t cycles_sum = 0;
- double standard_deviation_sum = 0;
- uint64_t min = UINT64_MAX;
- uint64_t max = 0;
- uint32_t samples_sum = 0;
- uint32_t iterations_sum = 0;
- clock_t time_sum = 0;
- uint64_t num_threads = gpu::get_num_threads();
- for (uint64_t i = 0; i < num_threads; i++) {
- BenchmarkResult current_result = results[i];
- cycles_sum += current_result.cycles;
- standard_deviation_sum += current_result.standard_deviation;
- min = cpp::min(min, current_result.min);
- max = cpp::max(max, current_result.max);
- samples_sum += current_result.samples;
- iterations_sum += current_result.total_iterations;
- time_sum += current_result.total_time;
+struct AtomicBenchmarkSums {
+ cpp::Atomic<uint64_t> cycles_sum = 0;
+ cpp::Atomic<uint64_t> standard_deviation_sum = 0;
+ cpp::Atomic<uint64_t> min = UINT64_MAX;
+ cpp::Atomic<uint64_t> max = 0;
+ cpp::Atomic<uint32_t> samples_sum = 0;
+ cpp::Atomic<uint32_t> iterations_sum = 0;
+ cpp::Atomic<clock_t> time_sum = 0;
+ cpp::Atomic<uint64_t> active_threads = 0;
+
+ void reset() {
+ active_threads.store(0, cpp::MemoryOrder::RELAXED);
+ cycles_sum.store(0, cpp::MemoryOrder::RELAXED);
+ standard_deviation_sum.store(0, cpp::MemoryOrder::RELAXED);
+ min.store(UINT64_MAX, cpp::MemoryOrder::RELAXED);
+ max.store(0, cpp::MemoryOrder::RELAXED);
+ samples_sum.store(0, cpp::MemoryOrder::RELAXED);
+ iterations_sum.store(0, cpp::MemoryOrder::RELAXED);
+ time_sum.store(0, cpp::MemoryOrder::RELAXED);
}
- result.cycles = cycles_sum / num_threads;
- result.standard_deviation = standard_deviation_sum / num_threads;
- result.min = min;
- result.max = max;
- result.samples = samples_sum / num_threads;
- result.total_iterations = iterations_sum / num_threads;
- result.total_time = time_sum / num_threads;
- return result;
+
+ void update(const BenchmarkResult &result) {
+ gpu::memory_fence();
+ active_threads.fetch_add(1, cpp::MemoryOrder::RELAXED);
+
+ cycles_sum.fetch_add(result.cycles, cpp::MemoryOrder::RELAXED);
+ standard_deviation_sum.fetch_add(
+ static_cast<uint64_t>(result.standard_deviation),
+ cpp::MemoryOrder::RELAXED);
+
+ // Perform a CAS loop to atomically update the min
+ uint64_t orig_min = min.load(cpp::MemoryOrder::RELAXED);
+ while (!min.compare_exchange_strong(
+ orig_min, cpp::min(orig_min, result.min), cpp::MemoryOrder::ACQUIRE,
+ cpp::MemoryOrder::RELAXED)) {
+ }
+
+ // Perform a CAS loop to atomically update the max
+ uint64_t orig_max = max.load(cpp::MemoryOrder::RELAXED);
+ while (!max.compare_exchange_strong(
+ orig_max, cpp::max(orig_max, result.max), cpp::MemoryOrder::ACQUIRE,
+ cpp::MemoryOrder::RELAXED)) {
+ }
+
+ samples_sum.fetch_add(result.samples, cpp::MemoryOrder::RELAXED);
+ iterations_sum.fetch_add(result.total_iterations,
+ cpp::MemoryOrder::RELAXED);
+ time_sum.fetch_add(result.total_time, cpp::MemoryOrder::RELAXED);
+ gpu::memory_fence();
----------------
jhuber6 wrote:
They're two separate things. The memory fence is more "flush all pending memory operations", whereas the thread fence is "memory operations cannot move past this fence".
https://github.com/llvm/llvm-project/pull/98842
More information about the libc-commits
mailing list