[libc-commits] [libc] [libc] Add Timing Utils for AMDGPU (PR #96828)

Sun Jun 30 14:25:13 PDT 2024

https://github.com/jameshu15869 updated https://github.com/llvm/llvm-project/pull/96828

>From 0d27d549b2c7f0ff74bd906b51f6c0950ed936ae Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Sat, 15 Jun 2024 17:50:59 -0400
Subject: [PATCH 1/2] add timing utils for amdgpu

---
 .../gpu/timing/amdgpu/CMakeLists.txt          |  7 ++
 libc/benchmarks/gpu/timing/amdgpu/timing.h    | 73 +++++++++++++++++++
 libc/benchmarks/gpu/timing/timing.h           |  2 +-
 3 files changed, 81 insertions(+), 1 deletion(-)
 create mode 100644 libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt
 create mode 100644 libc/benchmarks/gpu/timing/amdgpu/timing.h

diff --git a/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt b/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt
new file mode 100644
index 0000000000000..179429db9a09a
--- /dev/null
+++ b/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt
@@ -0,0 +1,7 @@
+add_header_library(
+  amdgpu_timing
+  HDRS
+    timing.h
+  DEPENDS
+    libc.src.__support.common
+)
diff --git a/libc/benchmarks/gpu/timing/amdgpu/timing.h b/libc/benchmarks/gpu/timing/amdgpu/timing.h
new file mode 100644
index 0000000000000..3d13826ffee30
--- /dev/null
+++ b/libc/benchmarks/gpu/timing/amdgpu/timing.h
@@ -0,0 +1,73 @@
+//===------------- AMDGPU implementation of timing utils --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
+#define LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
+
+#include "src/__support/GPU/utils.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/attributes.h"
+#include "src/__support/macros/config.h"
+
+#include <stdint.h>
+
+namespace LIBC_NAMESPACE {
+
+// Returns the overhead associated with calling the profiling region. This
+// allows us to subtract the constant-time overhead from the latency to
+// obtain a true result. This can vary with system load.
+[[gnu::noinline]] static LIBC_INLINE uint64_t overhead() {
+  gpu::memory_fence(); // Wait on pending results before timing.
+  uint64_t start = gpu::processor_clock();
+  uint32_t result = 0; // Dummy operand standing in for a function result.
+  asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result) :);
+  asm("" ::"s"(start)); // Requires `start` to have been computed by here.
+  uint64_t stop = gpu::processor_clock();
+  return stop - start;
+}
+
+// Profile a single-argument function and obtain its latency in clock cycles
+// on the system. It cannot be inlined, or else it will disturb the very
+// delicate balance of hard-coded dependencies.
+template <typename F, typename T>
+[[gnu::noinline]] static LIBC_INLINE uint64_t latency(F f, T t) {
+  // We need to store the input somewhere to guarantee that the compiler will
+  // not constant propagate it and remove the profiling region.
+  volatile uint32_t storage = t;
+  float arg = storage;
+  asm("" ::"s"(arg));
+
+  // The AMDGPU architecture needs to wait on pending results.
+  gpu::memory_fence();
+  // Get the current timestamp from the clock.
+  uint64_t start = gpu::processor_clock();
+
+  // This forces the compiler to load the input argument and run the clock cycle
+  // counter before the profiling region.
+  asm("" ::"s"(arg), "s"(start));
+
+  // Run the function under test and return its value.
+  auto result = f(arg);
+
+  // This inline assembly performs a no-op which forces the result to both be
+  // used and prevents us from exiting this region before it's complete.
+  asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result) :);
+
+  // Obtain the current timestamp after running the calculation and force
+  // ordering.
+  uint64_t stop = gpu::processor_clock();
+  asm("" ::"s"(stop));
+  gpu::memory_fence();
+
+  // Return the elapsed time between the two clock reads, in clock cycles.
+  return stop - start;
+}
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
diff --git a/libc/benchmarks/gpu/timing/timing.h b/libc/benchmarks/gpu/timing/timing.h
index 180ea77954ae5..2e098feb4b3a5 100644
--- a/libc/benchmarks/gpu/timing/timing.h
+++ b/libc/benchmarks/gpu/timing/timing.h
@@ -12,7 +12,7 @@
 #include "src/__support/macros/properties/architectures.h"
 
 #if defined(LIBC_TARGET_ARCH_IS_AMDGPU)
-#error "amdgpu not yet supported"
+#include "amdgpu/timing.h"
 #elif defined(LIBC_TARGET_ARCH_IS_NVPTX)
 #include "nvptx/timing.h"
 #else

>From 2a6f15d1cbde6f3d3feed5029ad7525c21f14ea9 Mon Sep 17 00:00:00 2001
From: jameshu15869 <jhudson15869 at gmail.com>
Date: Sun, 30 Jun 2024 17:24:46 -0400
Subject: [PATCH 2/2] correctly store input arguments into registers

---
 libc/benchmarks/gpu/timing/amdgpu/timing.h | 59 ++++++++++++++++++----
 1 file changed, 49 insertions(+), 10 deletions(-)

diff --git a/libc/benchmarks/gpu/timing/amdgpu/timing.h b/libc/benchmarks/gpu/timing/amdgpu/timing.h
index 3d13826ffee30..1eecb2acd2136 100644
--- a/libc/benchmarks/gpu/timing/amdgpu/timing.h
+++ b/libc/benchmarks/gpu/timing/amdgpu/timing.h
@@ -9,6 +9,7 @@
 #ifndef LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
 #define LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
 
+#include "src/__support/CPP/type_traits.h"
 #include "src/__support/GPU/utils.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/attributes.h"
@@ -16,6 +17,14 @@
 
 #include <stdint.h>
 
+// AMDGPU does not support input register constraints for i1 and i8, so any
+// one-byte type is widened to uint16_t before being loaded into a register.
+#define FORCE_TO_REGISTER(TYPE, VARIABLE)                                      \
+  if constexpr (sizeof(TYPE) < sizeof(uint16_t))                               \
+    asm("" ::"v"(static_cast<uint16_t>(VARIABLE)));                            \
+  else                                                                         \
+    asm("" ::"v"(VARIABLE))
+
 namespace LIBC_NAMESPACE {
 
 // Returns the overhead associated with calling the profiling region. This
@@ -36,26 +45,28 @@ namespace LIBC_NAMESPACE {
 // delicate balance of hard-coded dependencies.
 template <typename F, typename T>
 [[gnu::noinline]] static LIBC_INLINE uint64_t latency(F f, T t) {
-  // We need to store the input somewhere to guarantee that the compiler will
-  // not constant propagate it and remove the profiling region.
-  volatile uint32_t storage = t;
-  float arg = storage;
-  asm("" ::"s"(arg));
+  // We need to store the input somewhere to guarantee that the compiler
+  // will not constant propagate it and remove the profiling region.
+  volatile T storage = t;
+  T arg = storage;
+
+  FORCE_TO_REGISTER(T, arg);
 
   // The AMDGPU architecture needs to wait on pending results.
   gpu::memory_fence();
   // Get the current timestamp from the clock.
   uint64_t start = gpu::processor_clock();
 
-  // This forces the compiler to load the input argument and run the clock cycle
-  // counter before the profiling region.
-  asm("" ::"s"(arg), "s"(start));
+  // This forces the compiler to load the input argument and run the clock
+  // cycle counter before the profiling region.
+  FORCE_TO_REGISTER(T, arg);
+  asm("" ::"s"(start));
 
   // Run the function under test and return its value.
   auto result = f(arg);
 
-  // This inline assembly performs a no-op which forces the result to both be
-  // used and prevents us from exiting this region before it's complete.
+  // This inline assembly performs a no-op which forces the result to both
+  // be used and prevents us from exiting this region before it's complete.
   asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result) :);
 
   // Obtain the current timestamp after running the calculation and force
@@ -68,6 +79,34 @@ template <typename F, typename T>
   return stop - start;
 }
 
+template <typename F, typename T1, typename T2>
+[[gnu::noinline]] static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {
+  volatile T1 storage1 = t1;
+  volatile T2 storage2 = t2;
+  T1 arg1 = storage1;
+  T2 arg2 = storage2;
+  // Reading the inputs back from volatile storage blocks constant propagation.
+  FORCE_TO_REGISTER(T1, arg1);
+  FORCE_TO_REGISTER(T2, arg2);
+  // Wait on pending results, then take the starting timestamp.
+  gpu::memory_fence();
+  uint64_t start = gpu::processor_clock();
+  // Make the inputs and the start timestamp available before the timed call.
+  FORCE_TO_REGISTER(T1, arg1);
+  FORCE_TO_REGISTER(T2, arg2);
+  asm("" ::"s"(start));
+  // Run the two-argument function under test.
+  auto result = f(arg1, arg2);
+  // No-op on the result so it is used and the region cannot be exited early.
+  asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result) :);
+  // Take the ending timestamp and force ordering.
+  uint64_t stop = gpu::processor_clock();
+  asm("" ::"s"(stop));
+  gpu::memory_fence();
+  // Return the elapsed time between the two clock reads.
+  return stop - start;
+}
+
 } // namespace LIBC_NAMESPACE
 
 #endif // LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU