[libc-commits] [libc] [libc] Update the AMDGPU implementation to use code object 5 (PR #72580)

Tue Nov 21 04:47:38 PST 2023

https://github.com/jhuber6 updated https://github.com/llvm/llvm-project/pull/72580

>From 3e0bbbfe2bf564c17b57ba79030f67463d9c2cab Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Thu, 16 Nov 2023 16:22:00 -0600
Subject: [PATCH] [libc] Update the AMDGPU implementation to use code object 5

Summary:
This patch includes the necessary changes to make the `libc` tests
running on AMD GPUs run using the newer code object version. The 'code
object version' is AMD's internal ABI for making kernel calls. The move
from 4 to 5 changed how we handle arguments for builtins such as
obtaining the grid size or setting up the size of the private stack.

Fixes: https://github.com/llvm/llvm-project/issues/72517
---
 .../modules/prepare_libc_gpu_build.cmake      |  5 +--
 libc/utils/gpu/loader/amdgpu/Loader.cpp       | 35 ++++++++++++++++---
 2 files changed, 32 insertions(+), 8 deletions(-)

diff --git a/libc/cmake/modules/prepare_libc_gpu_build.cmake b/libc/cmake/modules/prepare_libc_gpu_build.cmake
index 0b6067f69775c45..377bc4eeae8fc9b 100644
--- a/libc/cmake/modules/prepare_libc_gpu_build.cmake
+++ b/libc/cmake/modules/prepare_libc_gpu_build.cmake
@@ -120,8 +120,5 @@ if(LIBC_GPU_TARGET_ARCHITECTURE_IS_AMDGPU)
   # The AMDGPU environment uses different code objects to encode the ABI for
   # kernel calls and intrinsic functions. We want to specify this manually to
   # conform to whatever the test suite was built to handle.
-  # FIXME: The test suite currently hangs when compiled targeting version five.
-  # This occurrs during traversal of the callback array in the startup code. We
-  # deliberately use version four until this can be addressed.
-  set(LIBC_GPU_CODE_OBJECT_VERSION 4)
+  set(LIBC_GPU_CODE_OBJECT_VERSION 5)
 endif()
diff --git a/libc/utils/gpu/loader/amdgpu/Loader.cpp b/libc/utils/gpu/loader/amdgpu/Loader.cpp
index b1b3aa6ce028ca5..4272cf40bfd1f67 100644
--- a/libc/utils/gpu/loader/amdgpu/Loader.cpp
+++ b/libc/utils/gpu/loader/amdgpu/Loader.cpp
@@ -34,6 +34,19 @@
 #include <tuple>
 #include <utility>
 
+// The implicit arguments of COV5 AMDGPU kernels.
+struct implicit_args_t {
+  uint32_t grid_size_x;
+  uint32_t grid_size_y;
+  uint32_t grid_size_z;
+  uint16_t workgroup_size_x;
+  uint16_t workgroup_size_y;
+  uint16_t workgroup_size_z;
+  uint8_t Unused0[46];
+  uint16_t grid_dims;
+  uint8_t Unused1[190];
+};
+
 /// Print the error code and exit if \p code indicates an error.
 static void handle_error(hsa_status_t code) {
   if (code == HSA_STATUS_SUCCESS || code == HSA_STATUS_INFO_BREAK)
@@ -185,11 +198,13 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
   uint32_t args_size;
   uint32_t group_size;
   uint32_t private_size;
+  bool dynamic_stack;
 
   std::pair<hsa_executable_symbol_info_t, void *> symbol_infos[] = {
       {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel},
       {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &args_size},
       {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &group_size},
+      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK, &dynamic_stack},
       {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &private_size}};
 
   for (auto &[info, value] : symbol_infos)
@@ -209,6 +224,19 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
   std::memset(args, 0, args_size);
   std::memcpy(args, &kernel_args, sizeof(args_t));
 
+  // Initialize the necessary implicit arguments to the proper values.
+  bool dims = 1 + (params.num_blocks_y * params.num_threads_y != 1) +
+              (params.num_blocks_z * params.num_threads_z != 1);
+  implicit_args_t *implicit_args = reinterpret_cast<implicit_args_t *>(
+      reinterpret_cast<uint8_t *>(args) + sizeof(args_t));
+  implicit_args->grid_dims = dims;
+  implicit_args->grid_size_x = params.num_threads_x;
+  implicit_args->grid_size_y = params.num_threads_y;
+  implicit_args->grid_size_z = params.num_threads_z;
+  implicit_args->workgroup_size_x = params.num_blocks_x;
+  implicit_args->workgroup_size_y = params.num_blocks_y;
+  implicit_args->workgroup_size_z = params.num_blocks_z;
+
   // Obtain a packet from the queue.
   uint64_t packet_id = hsa_queue_add_write_index_relaxed(queue, 1);
   while (packet_id - hsa_queue_load_read_index_scacquire(queue) >= queue->size)
@@ -222,9 +250,7 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
   // Set up the packet for exeuction on the device. We currently only launch
   // with one thread on the device, forcing the rest of the wavefront to be
   // masked off.
-  uint16_t setup = (1 + (params.num_blocks_y * params.num_threads_y != 1) +
-                    (params.num_blocks_z * params.num_threads_z != 1))
-                   << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
+  uint16_t setup = (dims) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
   packet->workgroup_size_x = params.num_threads_x;
   packet->workgroup_size_y = params.num_threads_y;
   packet->workgroup_size_z = params.num_threads_z;
@@ -232,7 +258,8 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
   packet->grid_size_x = params.num_blocks_x * params.num_threads_x;
   packet->grid_size_y = params.num_blocks_y * params.num_threads_y;
   packet->grid_size_z = params.num_blocks_z * params.num_threads_z;
-  packet->private_segment_size = private_size;
+  packet->private_segment_size =
+      dynamic_stack ? 16 * 1024 /* 16 KB */ : private_size;
   packet->group_segment_size = group_size;
   packet->kernel_object = kernel;
   packet->kernarg_address = args;