[libc-commits] [libc] [libc] Handle differing wavefront sizes correctly in the AMDHSA loader (PR #117788)
Joseph Huber via libc-commits
libc-commits at lists.llvm.org
Tue Nov 26 12:55:24 PST 2024
https://github.com/jhuber6 created https://github.com/llvm/llvm-project/pull/117788
Summary:
The AMDGPU backend can handle wavefront sizes of 32 and 64, with the
native hardware preferring one or the other. The user can override the
hardware default with `-mwavefrontsize64` or `-mwavefrontsize32`, which
previously wasn't handled. We need to know the wavefront size to know
how much memory to allocate and how to index the RPC buffer. There isn't
a good way to do this with ROCm so we just use the LLVM support for
offloading to check this from the image.
>From 3c2b2e66267f3902fd009ec7f0176af44aec6a3b Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Tue, 26 Nov 2024 14:35:09 -0600
Subject: [PATCH] [libc] Handle differing wavefront sizes correctly in the
AMDHSA loader
Summary:
The AMDGPU backend can handle wavefront sizes of 32 and 64, with the
native hardware preferring one or the other. The user can override the
hardware default with `-mwavefrontsize64` or `-mwavefrontsize32`, which
previously wasn't handled. We need to know the wavefront size to know
how much memory to allocate and how to index the RPC buffer. There isn't
a good way to do this with ROCm so we just use the LLVM support for
offloading to check this from the image.
---
libc/utils/gpu/loader/amdgpu/CMakeLists.txt | 1 +
.../utils/gpu/loader/amdgpu/amdhsa-loader.cpp | 46 +++++++++++--------
2 files changed, 28 insertions(+), 19 deletions(-)
diff --git a/libc/utils/gpu/loader/amdgpu/CMakeLists.txt b/libc/utils/gpu/loader/amdgpu/CMakeLists.txt
index 46c5631046ce20..80c5ae357416af 100644
--- a/libc/utils/gpu/loader/amdgpu/CMakeLists.txt
+++ b/libc/utils/gpu/loader/amdgpu/CMakeLists.txt
@@ -3,6 +3,7 @@ set(LLVM_LINK_COMPONENTS
Object
Option
Support
+ FrontendOffloading
)
add_llvm_executable(amdhsa-loader amdhsa-loader.cpp)
diff --git a/libc/utils/gpu/loader/amdgpu/amdhsa-loader.cpp b/libc/utils/gpu/loader/amdgpu/amdhsa-loader.cpp
index 13a13668335471..5a9fe87077328e 100644
--- a/libc/utils/gpu/loader/amdgpu/amdhsa-loader.cpp
+++ b/libc/utils/gpu/loader/amdgpu/amdhsa-loader.cpp
@@ -28,6 +28,8 @@
#include "hsa/hsa_ext_amd.h"
#endif
+#include "llvm/Frontend/Offloading/Utility.h"
+
#include <atomic>
#include <cstdio>
#include <cstdlib>
@@ -163,17 +165,13 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
hsa_queue_t *queue, rpc::Server &server,
const LaunchParameters &params,
const char *kernel_name, args_t kernel_args,
- bool print_resource_usage) {
+ uint32_t wavefront_size, bool print_resource_usage) {
// Look up the kernel in the loaded executable.
hsa_executable_symbol_t symbol;
if (hsa_status_t err = hsa_executable_get_symbol_by_name(
executable, kernel_name, &dev_agent, &symbol))
return err;
- uint32_t wavefront_size = 0;
- if (hsa_status_t err = hsa_agent_get_info(
- dev_agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size))
- handle_error(err);
// Retrieve different properties of the kernel symbol used for launch.
uint64_t kernel;
uint32_t args_size;
@@ -419,6 +417,16 @@ int load(int argc, const char **argv, const char **envp, void *image,
dev_agent, &coarsegrained_pool))
handle_error(err);
+ // The AMDGPU target can change its wavefront size. There currently isn't a
+ // good way to look this up through the HSA API so we use the LLVM interface.
+ uint16_t abi_version;
+ llvm::StringRef image_ref(reinterpret_cast<char *>(image), size);
+ llvm::StringMap<llvm::offloading::amdgpu::AMDGPUKernelMetaData> info_map;
+ if (llvm::Error err = llvm::offloading::amdgpu::getAMDGPUMetaDataFromImage(
+ llvm::MemoryBufferRef(image_ref, ""), info_map, abi_version)) {
+ handle_error(llvm::toString(std::move(err)).c_str());
+ }
+
// Allocate fine-grained memory on the host to hold the pointer array for the
// copied argv and allow the GPU agent to access it.
auto allocator = [&](uint64_t size) -> void * {
@@ -448,10 +456,10 @@ int load(int argc, const char **argv, const char **envp, void *image,
hsa_amd_memory_fill(dev_ret, 0, /*count=*/1);
// Allocate finegrained memory for the RPC server and client to share.
- uint32_t wavefront_size = 0;
- if (hsa_status_t err = hsa_agent_get_info(
- dev_agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size))
- handle_error(err);
+ uint32_t wavefront_size =
+ llvm::max_element(info_map, [](auto &&x, auto &&y) {
+ return x.second.WavefrontSize < y.second.WavefrontSize;
+ })->second.WavefrontSize;
// Set up the RPC server.
void *rpc_buffer;
@@ -513,7 +521,6 @@ int load(int argc, const char **argv, const char **envp, void *image,
if (HSA_STATUS_SUCCESS ==
hsa_executable_get_symbol_by_name(executable, "__llvm_libc_clock_freq",
&dev_agent, &freq_sym)) {
-
void *host_clock_freq;
if (hsa_status_t err =
hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(uint64_t),
@@ -553,16 +560,17 @@ int load(int argc, const char **argv, const char **envp, void *image,
LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
begin_args_t init_args = {argc, dev_argv, dev_envp};
- if (hsa_status_t err = launch_kernel(dev_agent, executable, kernargs_pool,
- coarsegrained_pool, queue, server,
- single_threaded_params, "_begin.kd",
- init_args, print_resource_usage))
+ if (hsa_status_t err = launch_kernel(
+ dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
+ server, single_threaded_params, "_begin.kd", init_args,
+ info_map["_begin"].WavefrontSize, print_resource_usage))
handle_error(err);
start_args_t args = {argc, dev_argv, dev_envp, dev_ret};
if (hsa_status_t err = launch_kernel(
dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
- server, params, "_start.kd", args, print_resource_usage))
+ server, params, "_start.kd", args, info_map["_start"].WavefrontSize,
+ print_resource_usage))
handle_error(err);
void *host_ret;
@@ -580,10 +588,10 @@ int load(int argc, const char **argv, const char **envp, void *image,
int ret = *static_cast<int *>(host_ret);
end_args_t fini_args = {ret};
- if (hsa_status_t err = launch_kernel(dev_agent, executable, kernargs_pool,
- coarsegrained_pool, queue, server,
- single_threaded_params, "_end.kd",
- fini_args, print_resource_usage))
+ if (hsa_status_t err = launch_kernel(
+ dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
+ server, single_threaded_params, "_end.kd", fini_args,
+ info_map["_end"].WavefrontSize, print_resource_usage))
handle_error(err);
if (hsa_status_t err = hsa_amd_memory_pool_free(rpc_buffer))
More information about the libc-commits
mailing list