[libc-commits] [libc] 50445df - [libc] Add more utility functions for the GPU

Joseph Huber via libc-commits libc-commits at lists.llvm.org
Mon Apr 24 13:48:03 PDT 2023


Author: Joseph Huber
Date: 2023-04-24T15:47:53-05:00
New Revision: 50445dff43037014a23eb38b1f50bb698e64ffcf

URL: https://github.com/llvm/llvm-project/commit/50445dff43037014a23eb38b1f50bb698e64ffcf
DIFF: https://github.com/llvm/llvm-project/commit/50445dff43037014a23eb38b1f50bb698e64ffcf.diff

LOG: [libc] Add more utility functions for the GPU

This patch adds extra intrinsics for the GPU. Some of these are unused
for now but will be used later. We use these currently to update the
`RPC` handling. Currently, every thread can update the RPC client, which
isn't correct. This patch adds code necessary to allow a single thread
to perform the write while the others wait.

Feedback is welcome for the naming of these functions. I'm copying the
OpenMP nomenclature where we call an AMD `wavefront` or NVIDIA `warp` a
`lane`.

Reviewed By: tra

Differential Revision: https://reviews.llvm.org/D148810

Added: 
    

Modified: 
    libc/src/__support/GPU/amdgpu/utils.h
    libc/src/__support/GPU/generic/utils.h
    libc/src/__support/GPU/nvptx/utils.h
    libc/src/__support/RPC/CMakeLists.txt
    libc/startup/gpu/amdgpu/CMakeLists.txt
    libc/startup/gpu/amdgpu/start.cpp
    libc/startup/gpu/nvptx/CMakeLists.txt
    libc/startup/gpu/nvptx/start.cpp
    libc/test/integration/startup/gpu/rpc_test.cpp

Removed: 
    


################################################################################
diff  --git a/libc/src/__support/GPU/amdgpu/utils.h b/libc/src/__support/GPU/amdgpu/utils.h
index be90cb3edbc7a..a4ac7d26f0d91 100644
--- a/libc/src/__support/GPU/amdgpu/utils.h
+++ b/libc/src/__support/GPU/amdgpu/utils.h
@@ -14,11 +14,114 @@
 #include <stdint.h>
 
 namespace __llvm_libc {
+namespace gpu {
 
+/// The number of threads that execute in lock-step in a lane.
+constexpr const uint64_t LANE_SIZE = __AMDGCN_WAVEFRONT_SIZE;
+
+/// Returns the number of workgroups in the 'x' dimension of the grid.
+LIBC_INLINE uint32_t get_num_blocks_x() {
+  return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
+}
+
+/// Returns the number of workgroups in the 'y' dimension of the grid.
+LIBC_INLINE uint32_t get_num_blocks_y() {
+  return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
+}
+
+/// Returns the number of workgroups in the 'z' dimension of the grid.
+LIBC_INLINE uint32_t get_num_blocks_z() {
+  return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
+}
+
+/// Returns the 'x' dimension of the current AMD workgroup's id.
 LIBC_INLINE uint32_t get_block_id_x() {
   return __builtin_amdgcn_workgroup_id_x();
 }
 
+/// Returns the 'y' dimension of the current AMD workgroup's id.
+LIBC_INLINE uint32_t get_block_id_y() {
+  return __builtin_amdgcn_workgroup_id_y();
+}
+
+/// Returns the 'z' dimension of the current AMD workgroup's id.
+LIBC_INLINE uint32_t get_block_id_z() {
+  return __builtin_amdgcn_workgroup_id_z();
+}
+
+/// Returns the absolute id of the AMD workgroup.
+LIBC_INLINE uint64_t get_block_id() {
+  return get_block_id_x() + get_num_blocks_x() * get_block_id_y() +
+         get_num_blocks_x() * get_num_blocks_y() * get_block_id_z();
+}
+
+/// Returns the number of workitems in the 'x' dimension.
+LIBC_INLINE uint32_t get_num_threads_x() {
+  return __builtin_amdgcn_workgroup_size_x();
+}
+
+/// Returns the number of workitems in the 'y' dimension.
+LIBC_INLINE uint32_t get_num_threads_y() {
+  return __builtin_amdgcn_workgroup_size_y();
+}
+
+/// Returns the number of workitems in the 'z' dimension.
+LIBC_INLINE uint32_t get_num_threads_z() {
+  return __builtin_amdgcn_workgroup_size_z();
+}
+
+/// Returns the 'x' dimension id of the workitem in the current AMD workgroup.
+LIBC_INLINE uint32_t get_thread_id_x() {
+  return __builtin_amdgcn_workitem_id_x();
+}
+
+/// Returns the 'y' dimension id of the workitem in the current AMD workgroup.
+LIBC_INLINE uint32_t get_thread_id_y() {
+  return __builtin_amdgcn_workitem_id_y();
+}
+
+/// Returns the 'z' dimension id of the workitem in the current AMD workgroup.
+LIBC_INLINE uint32_t get_thread_id_z() {
+  return __builtin_amdgcn_workitem_id_z();
+}
+
+/// Returns the absolute id of the thread in the current AMD workgroup.
+LIBC_INLINE uint64_t get_thread_id() {
+  return get_thread_id_x() + get_num_threads_x() * get_thread_id_y() +
+         get_num_threads_x() * get_num_threads_y() * get_thread_id_z();
+}
+
+/// Returns the size of an AMD wavefront. Either 32 or 64 depending on hardware.
+LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }
+
+/// Returns the id of the thread inside of an AMD wavefront executing together.
+[[clang::convergent]] LIBC_INLINE uint32_t get_lane_id() {
+  if (LANE_SIZE == 64)
+    return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
+  else
+    return __builtin_amdgcn_mbcnt_lo(~0u, 0u);
+}
+
+/// Returns the bit-mask of active threads in the current wavefront.
+[[clang::convergent]] LIBC_INLINE uint64_t get_lane_mask() {
+  return __builtin_amdgcn_read_exec();
+}
+
+/// Copies the value from the first active thread in the wavefront to the rest.
+[[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint32_t x) {
+  return __builtin_amdgcn_readfirstlane(x);
+}
+
+/// Waits for all the threads in the block to converge and issues a fence.
+[[clang::convergent]] LIBC_INLINE void sync_threads() {
+  __builtin_amdgcn_s_barrier();
+  __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
+}
+
+/// Wait for all threads in the wavefront to converge, this is a noop on AMDGPU.
+[[clang::convergent]] LIBC_INLINE void sync_lane(uint64_t) {}
+
+} // namespace gpu
 } // namespace __llvm_libc
 
 #endif

diff  --git a/libc/src/__support/GPU/generic/utils.h b/libc/src/__support/GPU/generic/utils.h
index d54551f953712..20e1b16ec25ba 100644
--- a/libc/src/__support/GPU/generic/utils.h
+++ b/libc/src/__support/GPU/generic/utils.h
@@ -14,9 +14,51 @@
 #include <stdint.h>
 
 namespace __llvm_libc {
+namespace gpu {
+
+constexpr const uint64_t LANE_SIZE = 1;
+
+LIBC_INLINE uint32_t get_num_blocks_x() { return 1; }
+
+LIBC_INLINE uint32_t get_num_blocks_y() { return 0; }
+
+LIBC_INLINE uint32_t get_num_blocks_z() { return 0; }
 
 LIBC_INLINE uint32_t get_block_id_x() { return 0; }
 
+LIBC_INLINE uint32_t get_block_id_y() { return 0; }
+
+LIBC_INLINE uint32_t get_block_id_z() { return 0; }
+
+LIBC_INLINE uint64_t get_block_id() { return 0; }
+
+LIBC_INLINE uint32_t get_num_threads_x() { return 1; }
+
+LIBC_INLINE uint32_t get_num_threads_y() { return 0; }
+
+LIBC_INLINE uint32_t get_num_threads_z() { return 0; }
+
+LIBC_INLINE uint32_t get_thread_id_x() { return 0; }
+
+LIBC_INLINE uint32_t get_thread_id_y() { return 0; }
+
+LIBC_INLINE uint32_t get_thread_id_z() { return 0; }
+
+LIBC_INLINE uint64_t get_thread_id() { return 0; }
+
+LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }
+
+LIBC_INLINE uint32_t get_lane_id() { return 0; }
+
+LIBC_INLINE uint64_t get_lane_mask() { return 1; }
+
+LIBC_INLINE uint32_t broadcast_value(uint32_t x) { return x; }
+
+LIBC_INLINE void sync_threads() {}
+
+LIBC_INLINE void sync_lane(uint64_t) {}
+
+} // namespace gpu
 } // namespace __llvm_libc
 
 #endif

diff  --git a/libc/src/__support/GPU/nvptx/utils.h b/libc/src/__support/GPU/nvptx/utils.h
index fa361cdbdf5c5..88544db85b9f0 100644
--- a/libc/src/__support/GPU/nvptx/utils.h
+++ b/libc/src/__support/GPU/nvptx/utils.h
@@ -2,7 +2,7 @@
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 
@@ -14,9 +14,113 @@
 #include <stdint.h>
 
 namespace __llvm_libc {
+namespace gpu {
 
+/// The number of threads that execute in lock-step in a warp.
+constexpr const uint64_t LANE_SIZE = 32;
+
+/// Returns the number of CUDA blocks in the 'x' dimension.
+LIBC_INLINE uint32_t get_num_blocks_x() {
+  return __nvvm_read_ptx_sreg_nctaid_x();
+}
+
+/// Returns the number of CUDA blocks in the 'y' dimension.
+LIBC_INLINE uint32_t get_num_blocks_y() {
+  return __nvvm_read_ptx_sreg_nctaid_y();
+}
+
+/// Returns the number of CUDA blocks in the 'z' dimension.
+LIBC_INLINE uint32_t get_num_blocks_z() {
+  return __nvvm_read_ptx_sreg_nctaid_z();
+}
+
+/// Returns the 'x' dimension of the current CUDA block's id.
 LIBC_INLINE uint32_t get_block_id_x() { return __nvvm_read_ptx_sreg_ctaid_x(); }
 
+/// Returns the 'y' dimension of the current CUDA block's id.
+LIBC_INLINE uint32_t get_block_id_y() { return __nvvm_read_ptx_sreg_ctaid_y(); }
+
+/// Returns the 'z' dimension of the current CUDA block's id.
+LIBC_INLINE uint32_t get_block_id_z() { return __nvvm_read_ptx_sreg_ctaid_z(); }
+
+/// Returns the absolute id of the CUDA block.
+LIBC_INLINE uint64_t get_block_id() {
+  return get_block_id_x() + get_num_blocks_x() * get_block_id_y() +
+         get_num_blocks_x() * get_num_blocks_y() * get_block_id_z();
+}
+
+/// Returns the number of CUDA threads in the 'x' dimension.
+LIBC_INLINE uint32_t get_num_threads_x() {
+  return __nvvm_read_ptx_sreg_ntid_x();
+}
+
+/// Returns the number of CUDA threads in the 'y' dimension.
+LIBC_INLINE uint32_t get_num_threads_y() {
+  return __nvvm_read_ptx_sreg_ntid_y();
+}
+
+/// Returns the number of CUDA threads in the 'z' dimension.
+LIBC_INLINE uint32_t get_num_threads_z() {
+  return __nvvm_read_ptx_sreg_ntid_z();
+}
+
+/// Returns the 'x' dimension id of the thread in the current CUDA block.
+LIBC_INLINE uint32_t get_thread_id_x() { return __nvvm_read_ptx_sreg_tid_x(); }
+
+/// Returns the 'y' dimension id of the thread in the current CUDA block.
+LIBC_INLINE uint32_t get_thread_id_y() { return __nvvm_read_ptx_sreg_tid_y(); }
+
+/// Returns the 'z' dimension id of the thread in the current CUDA block.
+LIBC_INLINE uint32_t get_thread_id_z() { return __nvvm_read_ptx_sreg_tid_z(); }
+
+/// Returns the absolute id of the thread in the current CUDA block.
+LIBC_INLINE uint64_t get_thread_id() {
+  return get_thread_id_x() + get_num_threads_x() * get_thread_id_y() +
+         get_num_threads_x() * get_num_threads_y() * get_thread_id_z();
+}
+
+/// Returns the size of a CUDA warp.
+LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }
+
+/// Returns the id of the thread inside of a CUDA warp executing together.
+[[clang::convergent]] LIBC_INLINE uint32_t get_lane_id() {
+  return get_thread_id() & (get_lane_size() - 1);
+}
+
+/// Returns the bit-mask of active threads in the current warp.
+[[clang::convergent]] LIBC_INLINE uint64_t get_lane_mask() {
+  uint32_t mask;
+  asm volatile("activemask.b32 %0;" : "=r"(mask));
+  return mask;
+}
+
+/// Copies the value from the first active thread in the warp to the rest.
+[[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint32_t x) {
+  // NOTE: This is not sufficient in all cases on Volta hardware or later. The
+  // lane mask returned here is not always the true lane mask used by the
+  // intrinsics in cases of incedental or enforced divergence by the user.
+  uint64_t lane_mask = get_lane_mask();
+  uint64_t id = __builtin_ffsl(lane_mask) - 1;
+#if __CUDA_ARCH__ >= 600
+  return __nvvm_shfl_sync_idx_i32(lane_mask, x, id, get_lane_size() - 1);
+#else
+  return __nvvm_shfl_idx_i32(x, id, get_lane_size() - 1);
+#endif
+}
+
+/// Waits for all the threads in the block to converge and issues a fence.
+[[clang::convergent]] LIBC_INLINE void sync_threads() { __syncthreads(); }
+
+/// Waits for all threads in the warp to reconverge for independent scheduling.
+[[clang::convergent]] LIBC_INLINE void sync_lane(uint64_t mask) {
+#if __CUDA_ARCH__ >= 700
+  __nvvm_bar_warp_sync(mask);
+#else
+  (void)mask;
+#endif
+}
+
+} // namespace gpu
 } // namespace __llvm_libc
 
 #endif

diff  --git a/libc/src/__support/RPC/CMakeLists.txt b/libc/src/__support/RPC/CMakeLists.txt
index 6bcec0f6526d6..1ac2a3b0548cb 100644
--- a/libc/src/__support/RPC/CMakeLists.txt
+++ b/libc/src/__support/RPC/CMakeLists.txt
@@ -20,5 +20,6 @@ add_object_library(
   HDRS
     rpc_client.h
   DEPENDS
+    libc.src.__support.GPU.utils
     .rpc
 )

diff  --git a/libc/startup/gpu/amdgpu/CMakeLists.txt b/libc/startup/gpu/amdgpu/CMakeLists.txt
index 891d20993b080..a9f33af6d79ed 100644
--- a/libc/startup/gpu/amdgpu/CMakeLists.txt
+++ b/libc/startup/gpu/amdgpu/CMakeLists.txt
@@ -4,6 +4,7 @@ add_startup_object(
     start.cpp
   DEPENDS
     libc.src.__support.RPC.rpc_client
+    libc.src.__support.GPU.utils
   COMPILE_OPTIONS
     -ffreestanding # To avoid compiler warnings about calling the main function.
     -fno-builtin

diff  --git a/libc/startup/gpu/amdgpu/start.cpp b/libc/startup/gpu/amdgpu/start.cpp
index 66f06b086a233..e8b5029f2a760 100644
--- a/libc/startup/gpu/amdgpu/start.cpp
+++ b/libc/startup/gpu/amdgpu/start.cpp
@@ -6,16 +6,38 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "src/__support/GPU/utils.h"
 #include "src/__support/RPC/rpc_client.h"
 
-static __llvm_libc::cpp::Atomic<uint32_t> lock;
-
 extern "C" int main(int argc, char **argv, char **envp);
 
+namespace __llvm_libc {
+
+static cpp::Atomic<uint32_t> lock = 0;
+
+static cpp::Atomic<uint32_t> init = 0;
+
+void init_rpc(void *in, void *out, void *buffer) {
+  // Only a single thread should update the RPC data.
+  if (gpu::get_thread_id() == 0 && gpu::get_block_id() == 0) {
+    rpc::client.reset(&lock, in, out, buffer);
+    init.store(1, cpp::MemoryOrder::RELAXED);
+  }
+
+  // Wait until the previous thread signals that the data has been written.
+  while (!init.load(cpp::MemoryOrder::RELAXED))
+    rpc::sleep_briefly();
+
+  // Wait for the threads in the block to converge and fence the write.
+  gpu::sync_threads();
+}
+
+} // namespace __llvm_libc
+
 extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel]] void
 _start(int argc, char **argv, char **envp, int *ret, void *in, void *out,
        void *buffer) {
-  __llvm_libc::rpc::client.reset(&lock, in, out, buffer);
+  __llvm_libc::init_rpc(in, out, buffer);
 
   __atomic_fetch_or(ret, main(argc, argv, envp), __ATOMIC_RELAXED);
 }

diff  --git a/libc/startup/gpu/nvptx/CMakeLists.txt b/libc/startup/gpu/nvptx/CMakeLists.txt
index 49661691ecb57..b8a9f49d5be53 100644
--- a/libc/startup/gpu/nvptx/CMakeLists.txt
+++ b/libc/startup/gpu/nvptx/CMakeLists.txt
@@ -5,6 +5,7 @@ add_startup_object(
     start.cpp
   DEPENDS
     libc.src.__support.RPC.rpc_client
+    libc.src.__support.GPU.utils
   COMPILE_OPTIONS
     -ffreestanding # To avoid compiler warnings about calling the main function.
     -fno-builtin

diff  --git a/libc/startup/gpu/nvptx/start.cpp b/libc/startup/gpu/nvptx/start.cpp
index 9939c6e21330b..7b88e30f7f370 100644
--- a/libc/startup/gpu/nvptx/start.cpp
+++ b/libc/startup/gpu/nvptx/start.cpp
@@ -6,16 +6,38 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "src/__support/GPU/utils.h"
 #include "src/__support/RPC/rpc_client.h"
 
-static __llvm_libc::cpp::Atomic<uint32_t> lock;
-
 extern "C" int main(int argc, char **argv, char **envp);
 
-extern "C" [[gnu::visibility("protected")]] __attribute__((nvptx_kernel)) void
+namespace __llvm_libc {
+
+static cpp::Atomic<uint32_t> lock = 0;
+
+static cpp::Atomic<uint32_t> init = 0;
+
+void init_rpc(void *in, void *out, void *buffer) {
+  // Only a single thread should update the RPC data.
+  if (gpu::get_thread_id() == 0 && gpu::get_block_id() == 0) {
+    rpc::client.reset(&lock, in, out, buffer);
+    init.store(1, cpp::MemoryOrder::RELAXED);
+  }
+
+  // Wait until the previous thread signals that the data has been written.
+  while (!init.load(cpp::MemoryOrder::RELAXED))
+    rpc::sleep_briefly();
+
+  // Wait for the threads in the block to converge and fence the write.
+  gpu::sync_threads();
+}
+
+} // namespace __llvm_libc
+
+extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void
 _start(int argc, char **argv, char **envp, int *ret, void *in, void *out,
        void *buffer) {
-  __llvm_libc::rpc::client.reset(&lock, in, out, buffer);
+  __llvm_libc::init_rpc(in, out, buffer);
 
   __atomic_fetch_or(ret, main(argc, argv, envp), __ATOMIC_RELAXED);
 }

diff  --git a/libc/test/integration/startup/gpu/rpc_test.cpp b/libc/test/integration/startup/gpu/rpc_test.cpp
index 0d3f13718d594..0b23ee37f9983 100644
--- a/libc/test/integration/startup/gpu/rpc_test.cpp
+++ b/libc/test/integration/startup/gpu/rpc_test.cpp
@@ -13,7 +13,7 @@
 using namespace __llvm_libc;
 
 static void test_add_simple() {
-  uint32_t num_additions = 1000 + 10 * get_block_id_x();
+  uint32_t num_additions = 1000 + 10 * gpu::get_block_id_x();
   uint64_t cnt = 0;
   for (uint32_t i = 0; i < num_additions; ++i) {
     rpc::Port port = rpc::client.open(rpc::TEST_INCREMENT);


        


More information about the libc-commits mailing list