[libc-commits] [libc] eb66e31 - [libc] Add Timing Utils for AMDGPU (#96828)
via libc-commits
libc-commits at lists.llvm.org
Wed Jul 10 14:04:59 PDT 2024
Author: jameshu15869
Date: 2024-07-10T16:04:56-05:00
New Revision: eb66e31bc2e2b45c5ccc95300b89c48394084e30
URL: https://github.com/llvm/llvm-project/commit/eb66e31bc2e2b45c5ccc95300b89c48394084e30
DIFF: https://github.com/llvm/llvm-project/commit/eb66e31bc2e2b45c5ccc95300b89c48394084e30.diff
LOG: [libc] Add Timing Utils for AMDGPU (#96828)
PR for adding AMDGPU timing utils for benchmarking.
I was not able to test this code since I do not have an AMD GPU, but I
was able to successfully compile this code using
-DRUNTIMES_amdgcn-amd-amdhsa_LIBC_GPU_TEST_ARCHITECTURE=gfx90a
-DRUNTIMES_amdgcn-amd-amdhsa_LIBC_GPU_LOADER_EXECUTABLE=echo
-DRUNTIMES_amdgcn-amd-amdhsa_LIBC_GPU_TARGET_ARCHITECTURE=gfx90a to
force the code to compile without having an AMD GPU on my machine.
@jhuber6
Added:
libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt
libc/benchmarks/gpu/timing/amdgpu/timing.h
Modified:
libc/benchmarks/gpu/timing/timing.h
Removed:
################################################################################
diff --git a/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt b/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt
new file mode 100644
index 0000000000000..179429db9a09a
--- /dev/null
+++ b/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt
@@ -0,0 +1,7 @@
+add_header_library( # Declares the amdgpu_timing header library.
+ amdgpu_timing
+ HDRS
+ timing.h
+ DEPENDS
+ libc.src.__support.common # NOTE(review): timing.h includes more; confirm.
+)
diff --git a/libc/benchmarks/gpu/timing/amdgpu/timing.h b/libc/benchmarks/gpu/timing/amdgpu/timing.h
new file mode 100644
index 0000000000000..1eecb2acd2136
--- /dev/null
+++ b/libc/benchmarks/gpu/timing/amdgpu/timing.h
@@ -0,0 +1,112 @@
+//===------------- AMDGPU implementation of timing utils --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
+#define LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
+
+#include "src/__support/CPP/type_traits.h"
+#include "src/__support/GPU/utils.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/attributes.h"
+#include "src/__support/macros/config.h"
+
+#include <stdint.h>
+
+// Feeds VARIABLE to an empty asm as a "v" input. AMDGPU does not support input
+// register constraints for i1 and i8, so char/bool are cast to uint16_t first.
+#define FORCE_TO_REGISTER(TYPE, VARIABLE) \
+ if constexpr (cpp::is_same_v<TYPE, char> || cpp::is_same_v<TYPE, bool>) \
+ asm("" ::"v"(static_cast<uint16_t>(VARIABLE))); \
+ else \
+ asm("" ::"v"(VARIABLE))
+
+namespace LIBC_NAMESPACE {
+
+// Returns the overhead associated with calling the profiling region. This
+// allows us to subtract the constant-time overhead from the latency to
+// obtain a true result. This can vary with system load.
+[[gnu::noinline]] static LIBC_INLINE uint64_t overhead() {
+ gpu::memory_fence();
+ uint64_t start = gpu::processor_clock(); // Timestamp before the empty region.
+ uint32_t result = 0; // Integer stand-in for a profiled function's result.
+ asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result) :);
+ asm("" ::"s"(start));
+ uint64_t stop = gpu::processor_clock(); // Timestamp after the empty region.
+ return stop - start;
+}
+
+// Calls f(t) once and returns the gpu::processor_clock() delta measured around
+// the call. This function cannot be inlined or else it will disturb the very
+// delicate balance of hard-coded dependencies.
+template <typename F, typename T>
+[[gnu::noinline]] static LIBC_INLINE uint64_t latency(F f, T t) {
+ // We need to store the input somewhere to guarantee that the compiler
+ // will not constant propagate it and remove the profiling region.
+ volatile T storage = t;
+ T arg = storage; // Read the input back through the volatile copy.
+
+ FORCE_TO_REGISTER(T, arg); // Use arg as an asm input before timing starts.
+
+ // The AMDGPU architecture needs to wait on pending results.
+ gpu::memory_fence();
+ // Get the current timestamp from the clock.
+ uint64_t start = gpu::processor_clock();
+
+ // This forces the compiler to load the input argument and run the clock
+ // cycle counter before the profiling region.
+ FORCE_TO_REGISTER(T, arg);
+ asm("" ::"s"(start));
+
+ // Run the function under test and keep its value for the asm use below.
+ auto result = f(arg);
+
+ // This inline assembly performs a no-op which forces the result to be used
+ // and prevents us from exiting this region before it's complete.
+ asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result) :);
+
+ // Obtain the current timestamp after running the calculation and force
+ // ordering.
+ uint64_t stop = gpu::processor_clock();
+ asm("" ::"s"(stop));
+ gpu::memory_fence();
+
+ // Return the time elapsed.
+ return stop - start;
+}
+
+template <typename F, typename T1, typename T2> // Two-argument latency().
+[[gnu::noinline]] static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {
+ volatile T1 storage1 = t1; // Volatile copies keep the inputs from being
+ volatile T2 storage2 = t2; // constant propagated by the compiler.
+ T1 arg1 = storage1;
+ T2 arg2 = storage2;
+
+ FORCE_TO_REGISTER(T1, arg1);
+ FORCE_TO_REGISTER(T2, arg2);
+
+ gpu::memory_fence();
+ uint64_t start = gpu::processor_clock(); // Timestamp before the call.
+
+ FORCE_TO_REGISTER(T1, arg1); // Use the inputs and the start timestamp as
+ FORCE_TO_REGISTER(T2, arg2); // asm operands ahead of the profiled call.
+ asm("" ::"s"(start));
+
+ auto result = f(arg1, arg2); // The call being timed.
+
+ asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result) :);
+
+ uint64_t stop = gpu::processor_clock(); // Timestamp after the call.
+ asm("" ::"s"(stop));
+ gpu::memory_fence();
+
+ return stop - start; // Difference between the two clock reads.
+}
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
diff --git a/libc/benchmarks/gpu/timing/timing.h b/libc/benchmarks/gpu/timing/timing.h
index 180ea77954ae5..2e098feb4b3a5 100644
--- a/libc/benchmarks/gpu/timing/timing.h
+++ b/libc/benchmarks/gpu/timing/timing.h
@@ -12,7 +12,7 @@
#include "src/__support/macros/properties/architectures.h"
#if defined(LIBC_TARGET_ARCH_IS_AMDGPU)
-#error "amdgpu not yet supported"
+#include "amdgpu/timing.h"
#elif defined(LIBC_TARGET_ARCH_IS_NVPTX)
#include "nvptx/timing.h"
#else
More information about the libc-commits
mailing list