[libc-commits] [libc] 8341a40 - [libc] Update the AMDGPU implementation to use code object 5 (#72580)
via libc-commits
libc-commits at lists.llvm.org
Tue Nov 21 05:14:18 PST 2023
Author: Joseph Huber
Date: 2023-11-21T07:14:10-06:00
New Revision: 8341a40ec1993a5d2b75b326720e7b6661f4b36a
URL: https://github.com/llvm/llvm-project/commit/8341a40ec1993a5d2b75b326720e7b6661f4b36a
DIFF: https://github.com/llvm/llvm-project/commit/8341a40ec1993a5d2b75b326720e7b6661f4b36a.diff
LOG: [libc] Update the AMDGPU implementation to use code object 5 (#72580)
Summary:
This patch includes the necessary changes to make the `libc` tests
running on AMD GPUs run using the newer code object version. The 'code
object version' is AMD's internal ABI for making kernel calls. The move
from 4 to 5 changed how we handle arguments for builtins such as
obtaining the grid size or setting up the size of the private stack.
Fixes: https://github.com/llvm/llvm-project/issues/72517
Added:
Modified:
libc/cmake/modules/prepare_libc_gpu_build.cmake
libc/utils/gpu/loader/amdgpu/Loader.cpp
Removed:
################################################################################
diff --git a/libc/cmake/modules/prepare_libc_gpu_build.cmake b/libc/cmake/modules/prepare_libc_gpu_build.cmake
index 0b6067f69775c45..377bc4eeae8fc9b 100644
--- a/libc/cmake/modules/prepare_libc_gpu_build.cmake
+++ b/libc/cmake/modules/prepare_libc_gpu_build.cmake
@@ -120,8 +120,5 @@ if(LIBC_GPU_TARGET_ARCHITECTURE_IS_AMDGPU)
# The AMDGPU environment uses
diff erent code objects to encode the ABI for
# kernel calls and intrinsic functions. We want to specify this manually to
# conform to whatever the test suite was built to handle.
- # FIXME: The test suite currently hangs when compiled targeting version five.
- # This occurrs during traversal of the callback array in the startup code. We
- # deliberately use version four until this can be addressed.
- set(LIBC_GPU_CODE_OBJECT_VERSION 4)
+ set(LIBC_GPU_CODE_OBJECT_VERSION 5)
endif()
diff --git a/libc/utils/gpu/loader/amdgpu/Loader.cpp b/libc/utils/gpu/loader/amdgpu/Loader.cpp
index b1b3aa6ce028ca5..4272cf40bfd1f67 100644
--- a/libc/utils/gpu/loader/amdgpu/Loader.cpp
+++ b/libc/utils/gpu/loader/amdgpu/Loader.cpp
@@ -34,6 +34,19 @@
#include <tuple>
#include <utility>
+// The implicit arguments of COV5 AMDGPU kernels.
+struct implicit_args_t {
+ uint32_t grid_size_x;
+ uint32_t grid_size_y;
+ uint32_t grid_size_z;
+ uint16_t workgroup_size_x;
+ uint16_t workgroup_size_y;
+ uint16_t workgroup_size_z;
+ uint8_t Unused0[46];
+ uint16_t grid_dims;
+ uint8_t Unused1[190];
+};
+
/// Print the error code and exit if \p code indicates an error.
static void handle_error(hsa_status_t code) {
if (code == HSA_STATUS_SUCCESS || code == HSA_STATUS_INFO_BREAK)
@@ -185,11 +198,13 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
uint32_t args_size;
uint32_t group_size;
uint32_t private_size;
+ bool dynamic_stack;
std::pair<hsa_executable_symbol_info_t, void *> symbol_infos[] = {
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel},
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &args_size},
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &group_size},
+ {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK, &dynamic_stack},
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &private_size}};
for (auto &[info, value] : symbol_infos)
@@ -209,6 +224,19 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
std::memset(args, 0, args_size);
std::memcpy(args, &kernel_args, sizeof(args_t));
+ // Initialize the necessary implicit arguments to the proper values.
+ bool dims = 1 + (params.num_blocks_y * params.num_threads_y != 1) +
+ (params.num_blocks_z * params.num_threads_z != 1);
+ implicit_args_t *implicit_args = reinterpret_cast<implicit_args_t *>(
+ reinterpret_cast<uint8_t *>(args) + sizeof(args_t));
+ implicit_args->grid_dims = dims;
+ implicit_args->grid_size_x = params.num_threads_x;
+ implicit_args->grid_size_y = params.num_threads_y;
+ implicit_args->grid_size_z = params.num_threads_z;
+ implicit_args->workgroup_size_x = params.num_blocks_x;
+ implicit_args->workgroup_size_y = params.num_blocks_y;
+ implicit_args->workgroup_size_z = params.num_blocks_z;
+
// Obtain a packet from the queue.
uint64_t packet_id = hsa_queue_add_write_index_relaxed(queue, 1);
while (packet_id - hsa_queue_load_read_index_scacquire(queue) >= queue->size)
@@ -222,9 +250,7 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
// Set up the packet for exeuction on the device. We currently only launch
// with one thread on the device, forcing the rest of the wavefront to be
// masked off.
- uint16_t setup = (1 + (params.num_blocks_y * params.num_threads_y != 1) +
- (params.num_blocks_z * params.num_threads_z != 1))
- << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
+ uint16_t setup = (dims) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
packet->workgroup_size_x = params.num_threads_x;
packet->workgroup_size_y = params.num_threads_y;
packet->workgroup_size_z = params.num_threads_z;
@@ -232,7 +258,8 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
packet->grid_size_x = params.num_blocks_x * params.num_threads_x;
packet->grid_size_y = params.num_blocks_y * params.num_threads_y;
packet->grid_size_z = params.num_blocks_z * params.num_threads_z;
- packet->private_segment_size = private_size;
+ packet->private_segment_size =
+ dynamic_stack ? 16 * 1024 /* 16 KB */ : private_size;
packet->group_segment_size = group_size;
packet->kernel_object = kernel;
packet->kernarg_address = args;
More information about the libc-commits
mailing list