[clang] [Clang] Implement resource directory headers for common GPU intrinsics (PR #110179)

Thu Sep 26 15:21:15 PDT 2024

https://github.com/jhuber6 created https://github.com/llvm/llvm-project/pull/110179

Summary:
All GPU based languages provide some way to access things like the
thread ID or other resources. However, this is spread between many
different languages and it varies between targets. The goal here is to
provide a resource directory header that just provides these in an
easier to understand way, primarily so this can be used for C/C++ code.
The interface aims to be common, to faciliate easier porting, but target
specific stuff could be put in the individual headers.


>From 8a817316b38e0c9b3847898037e71076de23f92c Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Thu, 26 Sep 2024 16:47:14 -0500
Subject: [PATCH] [Clang] Implement resource directory headers for common GPU
 intrinsics

Summary:
All GPU based languages provide some way to access things like the
thread ID or other resources. However, this is spread between many
different languages and it varies between targets. The goal here is to
provide a resource directory header that just provides these in an
easier to understand way, primarily so this can be used for C/C++ code.
The interface aims to be common, to faciliate easier porting, but target
specific stuff could be put in the individual headers.
---
 clang/lib/Headers/CMakeLists.txt |  14 +++
 clang/lib/Headers/amdgpuintrin.h | 193 +++++++++++++++++++++++++++++++
 clang/lib/Headers/gpuintrin.h    |  18 +++
 clang/lib/Headers/nvptxintrin.h  | 190 ++++++++++++++++++++++++++++++
 4 files changed, 415 insertions(+)
 create mode 100644 clang/lib/Headers/amdgpuintrin.h
 create mode 100644 clang/lib/Headers/gpuintrin.h
 create mode 100644 clang/lib/Headers/nvptxintrin.h

diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index f5cc07c303f9eb..b439db1b2ac169 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -267,6 +267,12 @@ set(x86_files
   cpuid.h
   )
 
+set(gpu_files
+  gpuintrin.h
+  nvptxintrin.h
+  amdgpuintrin.h
+  )
+
 set(windows_only_files
   intrin0.h
   intrin.h
@@ -295,6 +301,7 @@ set(files
   ${systemz_files}
   ${ve_files}
   ${x86_files}
+  ${gpu_files}
   ${webassembly_files}
   ${windows_only_files}
   ${utility_files}
@@ -517,6 +524,7 @@ add_header_target("systemz-resource-headers" "${systemz_files};${zos_wrapper_fil
 add_header_target("ve-resource-headers" "${ve_files}")
 add_header_target("webassembly-resource-headers" "${webassembly_files}")
 add_header_target("x86-resource-headers" "${x86_files}")
+add_header_target("gpu-resource-headers" "${gpu_files}")
 
 # Other header groupings
 add_header_target("hlsl-resource-headers" ${hlsl_files})
@@ -703,6 +711,12 @@ install(
   EXCLUDE_FROM_ALL
   COMPONENT x86-resource-headers)
 
+install(
+  FILES ${gpu_files}
+  DESTINATION ${header_install_dir}
+  EXCLUDE_FROM_ALL
+  COMPONENT gpu-resource-headers)
+
 if(NOT CLANG_ENABLE_HLSL)
   set(EXCLUDE_HLSL EXCLUDE_FROM_ALL)
 endif()
diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
new file mode 100644
index 00000000000000..55f4148d5b9d32
--- /dev/null
+++ b/clang/lib/Headers/amdgpuintrin.h
@@ -0,0 +1,193 @@
+//===-- amdgpuintrin.h - AMDPGU intrinsic functions -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __AMDGPUINTRIN_H
+#define __AMDGPUINTRIN_H
+
+#ifndef __AMDGPU__
+#error "This file is intended for AMDGPU targets or offloading to AMDGPU
+#endif
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#if defined(__HIP__) || defined(__CUDA__)
+#define _DEFAULT_ATTRS __attribute__((device)) __attribute__((always_inline))
+#else
+#define _DEFAULT_ATTRS __attribute__((always_inline))
+#endif
+
+#pragma omp begin declare target device_type(nohost)
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+// Type aliases to the address spaces used by the AMDGPU backend.
+#define _private __attribute__((opencl_private))
+#define _constant __attribute__((opencl_constant))
+#define _local __attribute__((opencl_local))
+#define _global __attribute__((opencl_global))
+
+// Some helpers for OpenCL-like vectors types.
+#define _vector1 __attribute__((ext_vector_type(1)))
+#define _vector2 __attribute__((ext_vector_type(2)))
+#define _vector3 __attribute__((ext_vector_type(3)))
+#define _vector4 __attribute__((ext_vector_type(4)))
+
+// Attribute to declare a function as a kernel.
+#define _kernel __attribute__((amdgpu_kernel, visibility("protected")))
+
+// Returns the number of workgroups in the 'x' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_x() {
+  return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
+}
+
+// Returns the number of workgroups in the 'y' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_y() {
+  return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
+}
+
+// Returns the number of workgroups in the 'z' dimension of the grid.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_z() {
+  return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
+}
+
+// Returns the total number of workgruops in the grid.
+_DEFAULT_ATTRS static inline uint64_t _get_num_blocks() {
+  return _get_num_blocks_x() * _get_num_blocks_y() * _get_num_blocks_z();
+}
+
+// Returns the 'x' dimension of the current AMD workgroup's id.
+_DEFAULT_ATTRS static inline uint32_t _get_block_id_x() {
+  return __builtin_amdgcn_workgroup_id_x();
+}
+
+// Returns the 'y' dimension of the current AMD workgroup's id.
+_DEFAULT_ATTRS static inline uint32_t _get_block_id_y() {
+  return __builtin_amdgcn_workgroup_id_y();
+}
+
+// Returns the 'z' dimension of the current AMD workgroup's id.
+_DEFAULT_ATTRS static inline uint32_t _get_block_id_z() {
+  return __builtin_amdgcn_workgroup_id_z();
+}
+
+// Returns the absolute id of the AMD workgroup.
+_DEFAULT_ATTRS static inline uint64_t _get_block_id() {
+  return _get_block_id_x() + _get_num_blocks_x() * _get_block_id_y() +
+         _get_num_blocks_x() * _get_num_blocks_y() * _get_block_id_z();
+}
+
+// Returns the number of workitems in the 'x' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_threads_x() {
+  return __builtin_amdgcn_workgroup_size_x();
+}
+
+// Returns the number of workitems in the 'y' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_threads_y() {
+  return __builtin_amdgcn_workgroup_size_y();
+}
+
+// Returns the number of workitems in the 'z' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_threads_z() {
+  return __builtin_amdgcn_workgroup_size_z();
+}
+
+// Returns the total number of workitems in the workgroup.
+_DEFAULT_ATTRS static inline uint64_t _get_num_threads() {
+  return _get_num_threads_x() * _get_num_threads_y() * _get_num_threads_z();
+}
+
+// Returns the 'x' dimension id of the workitem in the current AMD workgroup.
+_DEFAULT_ATTRS static inline uint32_t _get_thread_id_x() {
+  return __builtin_amdgcn_workitem_id_x();
+}
+
+// Returns the 'y' dimension id of the workitem in the current AMD workgroup.
+_DEFAULT_ATTRS static inline uint32_t _get_thread_id_y() {
+  return __builtin_amdgcn_workitem_id_y();
+}
+
+// Returns the 'z' dimension id of the workitem in the current AMD workgroup.
+_DEFAULT_ATTRS static inline uint32_t _get_thread_id_z() {
+  return __builtin_amdgcn_workitem_id_z();
+}
+
+// Returns the absolute id of the thread in the current AMD workgroup.
+_DEFAULT_ATTRS static inline uint64_t _get_thread_id() {
+  return _get_thread_id_x() + _get_num_threads_x() * _get_thread_id_y() +
+         _get_num_threads_x() * _get_num_threads_y() * _get_thread_id_z();
+}
+
+// Returns the size of an AMD wavefront, either 32 or 64 depending on hardware
+// and compilation options.
+_DEFAULT_ATTRS static inline uint32_t _get_lane_size() {
+  return __builtin_amdgcn_wavefrontsize();
+}
+
+// Returns the id of the thread inside of an AMD wavefront executing together.
+_DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t _get_lane_id() {
+  return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
+}
+
+// Returns the bit-mask of active threads in the current wavefront.
+_DEFAULT_ATTRS [[clang::convergent]] static inline uint64_t _get_lane_mask() {
+  return __builtin_amdgcn_read_exec();
+}
+
+// Copies the value from the first active thread in the wavefront to the rest.
+_DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t
+_broadcast_value(uint64_t, uint32_t x) {
+  return __builtin_amdgcn_readfirstlane(x);
+}
+
+// Returns a bitmask of threads in the current lane for which \p x is true.
+_DEFAULT_ATTRS [[clang::convergent]] static inline uint64_t
+_ballot(uint64_t lane_mask, bool x) {
+  // The lane_mask & gives the nvptx semantics when lane_mask is a subset of
+  // the active threads
+  return lane_mask & __builtin_amdgcn_ballot_w64(x);
+}
+
+// Waits for all the threads in the block to converge and issues a fence.
+_DEFAULT_ATTRS [[clang::convergent]] static inline void _sync_threads() {
+  __builtin_amdgcn_s_barrier();
+  __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
+}
+
+// Wait for all threads in the wavefront to converge, this is a noop on AMDGPU.
+_DEFAULT_ATTRS [[clang::convergent]] static inline void _sync_lane(uint64_t) {
+  __builtin_amdgcn_wave_barrier();
+}
+
+// Shuffles the the lanes inside the wavefront according to the given index.
+_DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t
+_shuffle(uint64_t, uint32_t idx, uint32_t x) {
+  return __builtin_amdgcn_ds_bpermute(idx << 2, x);
+}
+
+// Returns the current value of the GPU's processor clock.
+// NOTE: The RDNA3 and RDNA2 architectures use a 20-bit cycle counter.
+_DEFAULT_ATTRS static inline uint64_t _processor_clock() {
+  return __builtin_readcyclecounter();
+}
+
+// Returns a fixed-frequency timestamp. The actual frequency is dependent on
+// the card and can only be queried via the driver.
+_DEFAULT_ATTRS static inline uint64_t _fixed_frequency_clock() {
+  return __builtin_readsteadycounter();
+}
+
+// Terminates execution of the associated wavefront.
+_DEFAULT_ATTRS [[noreturn]] static inline void _end_program() {
+  __builtin_amdgcn_endpgm();
+}
+
+#pragma omp end declare variant
+#pragma omp end declare target
+#undef _DEFAULT_ATTRS
+
+#endif // __AMDGPUINTRIN_H
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
new file mode 100644
index 00000000000000..09ca48b5580327
--- /dev/null
+++ b/clang/lib/Headers/gpuintrin.h
@@ -0,0 +1,18 @@
+//===-- gpuintrin.h - Generic GPU intrinsic functions ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __GPUINTRIN_H
+#define __GPUINTRIN_H
+
+#if defined(__NVPTX__)
+#include <nvptxintrin.h>
+#elif defined(__AMDGPU__)
+#include <amdgpuintrin.h>
+#endif
+
+#endif // __GPUINTRIN_H
diff --git a/clang/lib/Headers/nvptxintrin.h b/clang/lib/Headers/nvptxintrin.h
new file mode 100644
index 00000000000000..70de0e003d6dd3
--- /dev/null
+++ b/clang/lib/Headers/nvptxintrin.h
@@ -0,0 +1,190 @@
+//===-- nvptxintrin.h - NVPTX intrinsic functions -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __NVPTXINTRIN_H
+#define __NVPTXINTRIN_H
+
+#ifndef __NVPTX__
+#error "This file is intended for NVPTX targets or offloading to NVPTX
+#endif
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#if defined(__HIP__) || defined(__CUDA__)
+#define _DEFAULT_ATTRS __attribute__((device)) __attribute__((always_inline))
+#else
+#define _DEFAULT_ATTRS __attribute__((always_inline))
+#endif
+
+#pragma omp begin declare target device_type(nohost)
+#pragma omp begin declare variant match(device = {arch(nvptx64)})
+
+// Type aliases to the address spaces used by the NVPTX backend.
+#define _private __attribute__((opencl_private))
+#define _constant __attribute__((opencl_constant))
+#define _local __attribute__((opencl_local))
+#define _global __attribute__((opencl_global))
+
+// Some helpers for OpenCL-like vectors types.
+#define _vector1 __attribute__((ext_vector_type(1)))
+#define _vector2 __attribute__((ext_vector_type(2)))
+#define _vector3 __attribute__((ext_vector_type(3)))
+#define _vector4 __attribute__((ext_vector_type(4)))
+
+// Attribute to declare a function as a kernel.
+#define _kernel __attribute__((nvptx_kernel))
+
+// Returns the number of CUDA blocks in the 'x' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_x() {
+  return __nvvm_read_ptx_sreg_nctaid_x();
+}
+
+// Returns the number of CUDA blocks in the 'y' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_y() {
+  return __nvvm_read_ptx_sreg_nctaid_y();
+}
+
+// Returns the number of CUDA blocks in the 'z' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_z() {
+  return __nvvm_read_ptx_sreg_nctaid_z();
+}
+
+// Returns the total number of CUDA blocks.
+_DEFAULT_ATTRS static inline uint64_t _get_num_blocks() {
+  return _get_num_blocks_x() * _get_num_blocks_y() * _get_num_blocks_z();
+}
+
+// Returns the 'x' dimension of the current CUDA block's id.
+_DEFAULT_ATTRS static inline uint32_t _get_block_id_x() {
+  return __nvvm_read_ptx_sreg_ctaid_x();
+}
+
+// Returns the 'y' dimension of the current CUDA block's id.
+_DEFAULT_ATTRS static inline uint32_t _get_block_id_y() {
+  return __nvvm_read_ptx_sreg_ctaid_y();
+}
+
+// Returns the 'z' dimension of the current CUDA block's id.
+_DEFAULT_ATTRS static inline uint32_t _get_block_id_z() {
+  return __nvvm_read_ptx_sreg_ctaid_z();
+}
+
+// Returns the absolute id of the CUDA block.
+_DEFAULT_ATTRS static inline uint64_t _get_block_id() {
+  return _get_block_id_x() + _get_num_blocks_x() * _get_block_id_y() +
+         _get_num_blocks_x() * _get_num_blocks_y() * _get_block_id_z();
+}
+
+// Returns the number of CUDA threads in the 'x' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_threads_x() {
+  return __nvvm_read_ptx_sreg_ntid_x();
+}
+
+// Returns the number of CUDA threads in the 'y' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_threads_y() {
+  return __nvvm_read_ptx_sreg_ntid_y();
+}
+
+// Returns the number of CUDA threads in the 'z' dimension.
+_DEFAULT_ATTRS static inline uint32_t _get_num_threads_z() {
+  return __nvvm_read_ptx_sreg_ntid_z();
+}
+
+// Returns the total number of threads in the block.
+_DEFAULT_ATTRS static inline uint64_t _get_num_threads() {
+  return _get_num_threads_x() * _get_num_threads_y() * _get_num_threads_z();
+}
+
+// Returns the 'x' dimension id of the thread in the current CUDA block.
+_DEFAULT_ATTRS static inline uint32_t _get_thread_id_x() {
+  return __nvvm_read_ptx_sreg_tid_x();
+}
+
+// Returns the 'y' dimension id of the thread in the current CUDA block.
+_DEFAULT_ATTRS static inline uint32_t _get_thread_id_y() {
+  return __nvvm_read_ptx_sreg_tid_y();
+}
+
+// Returns the 'z' dimension id of the thread in the current CUDA block.
+_DEFAULT_ATTRS static inline uint32_t _get_thread_id_z() {
+  return __nvvm_read_ptx_sreg_tid_z();
+}
+
+// Returns the absolute id of the thread in the current CUDA block.
+_DEFAULT_ATTRS static inline uint64_t _get_thread_id() {
+  return _get_thread_id_x() + _get_num_threads_x() * _get_thread_id_y() +
+         _get_num_threads_x() * _get_num_threads_y() * _get_thread_id_z();
+}
+
+// Returns the size of a CUDA warp, always 32 on NVIDIA hardware.
+_DEFAULT_ATTRS static inline uint32_t _get_lane_size() { return 32; }
+
+// Returns the id of the thread inside of a CUDA warp executing together.
+_DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t _get_lane_id() {
+  return __nvvm_read_ptx_sreg_laneid();
+}
+
+// Returns the bit-mask of active threads in the current warp.
+_DEFAULT_ATTRS [[clang::convergent]] static inline uint64_t _get_lane_mask() {
+  return __nvvm_activemask();
+}
+
+// Copies the value from the first active thread in the warp to the rest.
+_DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t
+_broadcast_value(uint64_t lane_mask, uint32_t x) {
+  uint32_t mask = static_cast<uint32_t>(lane_mask);
+  uint32_t id = __builtin_ffs(mask) - 1;
+  return __nvvm_shfl_sync_idx_i32(mask, x, id, _get_lane_size() - 1);
+}
+
+// Returns a bitmask of threads in the current lane for which \p x is true.
+_DEFAULT_ATTRS [[clang::convergent]] static inline uint64_t
+_ballot(uint64_t lane_mask, bool x) {
+  uint32_t mask = static_cast<uint32_t>(lane_mask);
+  return __nvvm_vote_ballot_sync(mask, x);
+}
+
+// Waits for all the threads in the block to converge and issues a fence.
+_DEFAULT_ATTRS [[clang::convergent]] static inline void _sync_threads() {
+  __syncthreads();
+}
+
+// Waits for all threads in the warp to reconverge for independent scheduling.
+_DEFAULT_ATTRS [[clang::convergent]] static inline void
+_sync_lane(uint64_t mask) {
+  __nvvm_bar_warp_sync(static_cast<uint32_t>(mask));
+}
+
+// Shuffles the the lanes inside the warp according to the given index.
+_DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t
+_shuffle(uint64_t lane_mask, uint32_t idx, uint32_t x) {
+  uint32_t mask = static_cast<uint32_t>(lane_mask);
+  uint32_t bitmask = (mask >> idx) & 1;
+  return -bitmask &
+         __nvvm_shfl_sync_idx_i32(mask, x, idx, _get_lane_size() - 1);
+}
+
+// Returns the current value of the GPU's processor clock.
+_DEFAULT_ATTRS static inline uint64_t _processor_clock() {
+  return __builtin_readcyclecounter();
+}
+
+// Returns a global fixed-frequency timer at nanosecond frequency.
+_DEFAULT_ATTRS static inline uint64_t _fixed_frequency_clock() {
+  return __builtin_readsteadycounter();
+}
+
+// Terminates execution of the calling thread.
+_DEFAULT_ATTRS [[noreturn]] static inline void _end_program() { __nvvm_exit(); }
+
+#pragma omp end declare variant
+#pragma omp end declare target
+#undef _DEFAULT_ATTRS
+
+#endif // __NVPTXINTRIN_H